LLVM 22.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
48RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
50 // Check if the type is valid for all CostKind
51 if (!VT.isVector())
53 size_t NumInstr = OpCodes.size();
55 return NumInstr;
56 InstructionCost LMULCost = TLI->getLMULCost(VT);
58 return LMULCost * NumInstr;
59 InstructionCost Cost = 0;
60 for (auto Op : OpCodes) {
61 switch (Op) {
62 case RISCV::VRGATHER_VI:
63 Cost += TLI->getVRGatherVICost(VT);
64 break;
65 case RISCV::VRGATHER_VV:
66 Cost += TLI->getVRGatherVVCost(VT);
67 break;
68 case RISCV::VSLIDEUP_VI:
69 case RISCV::VSLIDEDOWN_VI:
70 Cost += TLI->getVSlideVICost(VT);
71 break;
72 case RISCV::VSLIDEUP_VX:
73 case RISCV::VSLIDEDOWN_VX:
74 Cost += TLI->getVSlideVXCost(VT);
75 break;
76 case RISCV::VREDMAX_VS:
77 case RISCV::VREDMIN_VS:
78 case RISCV::VREDMAXU_VS:
79 case RISCV::VREDMINU_VS:
80 case RISCV::VREDSUM_VS:
81 case RISCV::VREDAND_VS:
82 case RISCV::VREDOR_VS:
83 case RISCV::VREDXOR_VS:
84 case RISCV::VFREDMAX_VS:
85 case RISCV::VFREDMIN_VS:
86 case RISCV::VFREDUSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += Log2_32_Ceil(VL);
91 break;
92 }
93 case RISCV::VFREDOSUM_VS: {
94 unsigned VL = VT.getVectorMinNumElements();
95 if (!VT.isFixedLengthVector())
96 VL *= *getVScaleForTuning();
97 Cost += VL;
98 break;
99 }
100 case RISCV::VMV_X_S:
101 case RISCV::VMV_S_X:
102 case RISCV::VFMV_F_S:
103 case RISCV::VFMV_S_F:
104 case RISCV::VMOR_MM:
105 case RISCV::VMXOR_MM:
106 case RISCV::VMAND_MM:
107 case RISCV::VMANDN_MM:
108 case RISCV::VMNAND_MM:
109 case RISCV::VCPOP_M:
110 case RISCV::VFIRST_M:
111 Cost += 1;
112 break;
113 default:
114 Cost += LMULCost;
115 }
116 }
117 return Cost;
118}
119
121 const RISCVSubtarget *ST,
122 const APInt &Imm, Type *Ty,
124 bool FreeZeroes) {
125 assert(Ty->isIntegerTy() &&
126 "getIntImmCost can only estimate cost of materialising integers");
127
128 // We have a Zero register, so 0 is always free.
129 if (Imm == 0)
130 return TTI::TCC_Free;
131
132 // Otherwise, we check how many instructions it will take to materialise.
133 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
134 /*CompressionCost=*/false, FreeZeroes);
135}
136
140 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
141}
142
143// Look for patterns of shift followed by AND that can be turned into a pair of
144// shifts. We won't need to materialize an immediate for the AND so these can
145// be considered free.
146static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
147 uint64_t Mask = Imm.getZExtValue();
148 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
149 if (!BO || !BO->hasOneUse())
150 return false;
151
152 if (BO->getOpcode() != Instruction::Shl)
153 return false;
154
155 if (!isa<ConstantInt>(BO->getOperand(1)))
156 return false;
157
158 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
159 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
160 // is a mask shifted by c2 bits with c3 leading zeros.
161 if (isShiftedMask_64(Mask)) {
162 unsigned Trailing = llvm::countr_zero(Mask);
163 if (ShAmt == Trailing)
164 return true;
165 }
166
167 return false;
168}
169
170// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
171// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
172// the type will be split so only the lower 32 bits need to be compared using
173// (srai/srli X, C) == C2.
174static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
175 if (!Inst->hasOneUse())
176 return false;
177
178 // Look for equality comparison.
179 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
180 if (!Cmp || !Cmp->isEquality())
181 return false;
182
183 // Right hand side of comparison should be a constant.
184 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
185 if (!C)
186 return false;
187
188 uint64_t Mask = Imm.getZExtValue();
189
190 // Mask should be of the form -(1 << C) in the lower 32 bits.
191 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
192 return false;
193
194 // Comparison constant should be a subset of Mask.
195 uint64_t CmpC = C->getZExtValue();
196 if ((CmpC & Mask) != CmpC)
197 return false;
198
199 // We'll need to sign extend the comparison constant and shift it right. Make
200 // sure the new constant can use addi/xori+seqz/snez.
201 unsigned ShiftBits = llvm::countr_zero(Mask);
202 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
203 return NewCmpC >= -2048 && NewCmpC <= 2048;
204}
205
207 const APInt &Imm, Type *Ty,
209 Instruction *Inst) const {
210 assert(Ty->isIntegerTy() &&
211 "getIntImmCost can only estimate cost of materialising integers");
212
213 // We have a Zero register, so 0 is always free.
214 if (Imm == 0)
215 return TTI::TCC_Free;
216
217 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
218 // commutative, in others the immediate comes from a specific argument index.
219 bool Takes12BitImm = false;
220 unsigned ImmArgIdx = ~0U;
221
222 switch (Opcode) {
223 case Instruction::GetElementPtr:
224 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
225 // split up large offsets in GEP into better parts than ConstantHoisting
226 // can.
227 return TTI::TCC_Free;
228 case Instruction::Store: {
229 // Use the materialization cost regardless of if it's the address or the
230 // value that is constant, except for if the store is misaligned and
231 // misaligned accesses are not legal (experience shows constant hoisting
232 // can sometimes be harmful in such cases).
233 if (Idx == 1 || !Inst)
234 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
235 /*FreeZeroes=*/true);
236
237 StoreInst *ST = cast<StoreInst>(Inst);
238 if (!getTLI()->allowsMemoryAccessForAlignment(
239 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
240 ST->getPointerAddressSpace(), ST->getAlign()))
241 return TTI::TCC_Free;
242
243 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
244 /*FreeZeroes=*/true);
245 }
246 case Instruction::Load:
247 // If the address is a constant, use the materialization cost.
248 return getIntImmCost(Imm, Ty, CostKind);
249 case Instruction::And:
250 // zext.h
251 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
252 return TTI::TCC_Free;
253 // zext.w
254 if (Imm == UINT64_C(0xffffffff) &&
255 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
256 return TTI::TCC_Free;
257 // bclri
258 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
259 return TTI::TCC_Free;
260 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
261 canUseShiftPair(Inst, Imm))
262 return TTI::TCC_Free;
263 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
264 canUseShiftCmp(Inst, Imm))
265 return TTI::TCC_Free;
266 Takes12BitImm = true;
267 break;
268 case Instruction::Add:
269 Takes12BitImm = true;
270 break;
271 case Instruction::Or:
272 case Instruction::Xor:
273 // bseti/binvi
274 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
275 return TTI::TCC_Free;
276 Takes12BitImm = true;
277 break;
278 case Instruction::Mul:
279 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
280 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
281 return TTI::TCC_Free;
282 // One more or less than a power of 2 can use SLLI+ADD/SUB.
283 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
284 return TTI::TCC_Free;
285 // FIXME: There is no MULI instruction.
286 Takes12BitImm = true;
287 break;
288 case Instruction::Sub:
289 case Instruction::Shl:
290 case Instruction::LShr:
291 case Instruction::AShr:
292 Takes12BitImm = true;
293 ImmArgIdx = 1;
294 break;
295 default:
296 break;
297 }
298
299 if (Takes12BitImm) {
300 // Check immediate is the correct argument...
301 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
302 // ... and fits into the 12-bit immediate.
303 if (Imm.getSignificantBits() <= 64 &&
304 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
305 return TTI::TCC_Free;
306 }
307 }
308
309 // Otherwise, use the full materialisation cost.
310 return getIntImmCost(Imm, Ty, CostKind);
311 }
312
313 // By default, prevent hoisting.
314 return TTI::TCC_Free;
315}
316
319 const APInt &Imm, Type *Ty,
321 // Prevent hoisting in unknown cases.
322 return TTI::TCC_Free;
323}
324
326 return ST->hasVInstructions();
327}
328
330RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
331 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
332 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
333}
334
336 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
338 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
340
341 // zve32x is broken for partial_reduce_umla, but let's make sure we
342 // don't generate them.
343 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
344 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
345 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
346 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
348
349 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
351 // Note: Asuming all vqdot* variants are equal cost
352 return LT.first *
353 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
354}
355
357 // Currently, the ExpandReductions pass can't expand scalable-vector
358 // reductions, but we still request expansion as RVV doesn't support certain
359 // reductions and the SelectionDAG can't legalize them either.
360 switch (II->getIntrinsicID()) {
361 default:
362 return false;
363 // These reductions have no equivalent in RVV
364 case Intrinsic::vector_reduce_mul:
365 case Intrinsic::vector_reduce_fmul:
366 return true;
367 }
368}
369
370std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
371 if (ST->hasVInstructions())
372 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
373 return BaseT::getMaxVScale();
374}
375
376std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
377 if (ST->hasVInstructions())
378 if (unsigned MinVLen = ST->getRealMinVLen();
379 MinVLen >= RISCV::RVVBitsPerBlock)
380 return MinVLen / RISCV::RVVBitsPerBlock;
382}
383
386 unsigned LMUL =
387 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
388 switch (K) {
390 return TypeSize::getFixed(ST->getXLen());
392 return TypeSize::getFixed(
393 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
396 (ST->hasVInstructions() &&
397 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
399 : 0);
400 }
401
402 llvm_unreachable("Unsupported register kind");
403}
404
405InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
406 const TTI::TargetCostKind CostKind) const {
407 switch (CostKind) {
410 // Always 2 instructions
411 return 2;
412 case TTI::TCK_Latency:
414 // Depending on the memory model the address generation will
415 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
416 // have a way of getting this information here, so conservatively
417 // require both.
418 // In practice, these are generally implemented together.
419 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
420 }
421 llvm_unreachable("Unsupported cost kind");
422}
423
425RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
427 // Add a cost of address generation + the cost of the load. The address
428 // is expected to be a PC relative offset to a constant pool entry
429 // using auipc/addi.
430 return getStaticDataAddrGenerationCost(CostKind) +
431 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
432 /*AddressSpace=*/0, CostKind);
433}
434
435static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
436 unsigned Size = Mask.size();
437 if (!isPowerOf2_32(Size))
438 return false;
439 for (unsigned I = 0; I != Size; ++I) {
440 if (static_cast<unsigned>(Mask[I]) == I)
441 continue;
442 if (Mask[I] != 0)
443 return false;
444 if (Size % I != 0)
445 return false;
446 for (unsigned J = I + 1; J != Size; ++J)
447 // Check the pattern is repeated.
448 if (static_cast<unsigned>(Mask[J]) != J % I)
449 return false;
450 SubVectorSize = I;
451 return true;
452 }
453 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
454 return false;
455}
456
458 LLVMContext &C) {
459 assert((DataVT.getScalarSizeInBits() != 8 ||
460 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
461 MVT IndexVT = DataVT.changeTypeToInteger();
462 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
463 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
464 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
465}
466
467/// Attempt to approximate the cost of a shuffle which will require splitting
468/// during legalization. Note that processShuffleMasks is not an exact proxy
469/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
470/// reasonably close upperbound.
472 MVT LegalVT, VectorType *Tp,
473 ArrayRef<int> Mask,
475 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
476 "Expected fixed vector type and non-empty mask");
477 unsigned LegalNumElts = LegalVT.getVectorNumElements();
478 // Number of destination vectors after legalization:
479 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
480 // We are going to permute multiple sources and the result will be in
481 // multiple destinations. Providing an accurate cost only for splits where
482 // the element type remains the same.
483 if (NumOfDests <= 1 ||
485 Tp->getElementType()->getPrimitiveSizeInBits() ||
486 LegalNumElts >= Tp->getElementCount().getFixedValue())
488
489 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
490 unsigned LegalVTSize = LegalVT.getStoreSize();
491 // Number of source vectors after legalization:
492 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
493
494 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
495
496 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
497 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
498 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
499 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
500 assert(NormalizedVF >= Mask.size() &&
501 "Normalized mask expected to be not shorter than original mask.");
502 copy(Mask, NormalizedMask.begin());
503 InstructionCost Cost = 0;
504 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
506 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
507 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
508 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
509 return;
510 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
511 .second)
512 return;
513 Cost += TTI.getShuffleCost(
515 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
516 SingleOpTy, RegMask, CostKind, 0, nullptr);
517 },
518 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
519 Cost += TTI.getShuffleCost(
521 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
522 SingleOpTy, RegMask, CostKind, 0, nullptr);
523 });
524 return Cost;
525}
526
527/// Try to perform better estimation of the permutation.
528/// 1. Split the source/destination vectors into real registers.
529/// 2. Do the mask analysis to identify which real registers are
530/// permuted. If more than 1 source registers are used for the
531/// destination register building, the cost for this destination register
532/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
533/// source register is used, build mask and calculate the cost as a cost
534/// of PermuteSingleSrc.
535/// Also, for the single register permute we try to identify if the
536/// destination register is just a copy of the source register or the
537/// copy of the previous destination register (the cost is
538/// TTI::TCC_Basic). If the source register is just reused, the cost for
539/// this operation is 0.
540static InstructionCost
542 std::optional<unsigned> VLen, VectorType *Tp,
544 assert(LegalVT.isFixedLengthVector());
545 if (!VLen || Mask.empty())
547 MVT ElemVT = LegalVT.getVectorElementType();
548 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
549 LegalVT = TTI.getTypeLegalizationCost(
550 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
551 .second;
552 // Number of destination vectors after legalization:
553 InstructionCost NumOfDests =
554 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
555 if (NumOfDests <= 1 ||
557 Tp->getElementType()->getPrimitiveSizeInBits() ||
558 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
560
561 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
562 unsigned LegalVTSize = LegalVT.getStoreSize();
563 // Number of source vectors after legalization:
564 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
565
566 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
567 LegalVT.getVectorNumElements());
568
569 unsigned E = NumOfDests.getValue();
570 unsigned NormalizedVF =
571 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
572 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
573 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
574 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
575 assert(NormalizedVF >= Mask.size() &&
576 "Normalized mask expected to be not shorter than original mask.");
577 copy(Mask, NormalizedMask.begin());
578 InstructionCost Cost = 0;
579 int NumShuffles = 0;
580 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
582 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
583 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
584 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
585 return;
586 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
587 .second)
588 return;
589 ++NumShuffles;
590 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
591 SingleOpTy, RegMask, CostKind, 0, nullptr);
592 },
593 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
594 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
595 SingleOpTy, RegMask, CostKind, 0, nullptr);
596 NumShuffles += 2;
597 });
598 // Note: check that we do not emit too many shuffles here to prevent code
599 // size explosion.
600 // TODO: investigate, if it can be improved by extra analysis of the masks
601 // to check if the code is more profitable.
602 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
603 (NumOfDestRegs <= 2 && NumShuffles < 4))
604 return Cost;
606}
607
608InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
609 ArrayRef<int> Mask,
611 // Avoid missing masks and length changing shuffles
612 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
614
615 int NumElts = Tp->getNumElements();
616 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
617 // Avoid scalarization cases
618 if (!LT.second.isFixedLengthVector())
620
621 // Requires moving elements between parts, which requires additional
622 // unmodeled instructions.
623 if (LT.first != 1)
625
626 auto GetSlideOpcode = [&](int SlideAmt) {
627 assert(SlideAmt != 0);
628 bool IsVI = isUInt<5>(std::abs(SlideAmt));
629 if (SlideAmt < 0)
630 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
631 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
632 };
633
634 std::array<std::pair<int, int>, 2> SrcInfo;
635 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
637
638 if (SrcInfo[1].second == 0)
639 std::swap(SrcInfo[0], SrcInfo[1]);
640
641 InstructionCost FirstSlideCost = 0;
642 if (SrcInfo[0].second != 0) {
643 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
644 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
645 }
646
647 if (SrcInfo[1].first == -1)
648 return FirstSlideCost;
649
650 InstructionCost SecondSlideCost = 0;
651 if (SrcInfo[1].second != 0) {
652 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
653 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
654 } else {
655 SecondSlideCost =
656 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
657 }
658
659 auto EC = Tp->getElementCount();
660 VectorType *MaskTy =
662 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
663 return FirstSlideCost + SecondSlideCost + MaskCost;
664}
665
668 VectorType *SrcTy, ArrayRef<int> Mask,
669 TTI::TargetCostKind CostKind, int Index,
671 const Instruction *CxtI) const {
672 assert((Mask.empty() || DstTy->isScalableTy() ||
673 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
674 "Expected the Mask to match the return size if given");
675 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
676 "Expected the same scalar types");
677
678 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
679 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
680
681 // First, handle cases where having a fixed length vector enables us to
682 // give a more accurate cost than falling back to generic scalable codegen.
683 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
684 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
685 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
687 *this, LT.second, ST->getRealVLen(),
688 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
689 if (VRegSplittingCost.isValid())
690 return VRegSplittingCost;
691 switch (Kind) {
692 default:
693 break;
695 if (Mask.size() >= 2) {
696 MVT EltTp = LT.second.getVectorElementType();
697 // If the size of the element is < ELEN then shuffles of interleaves and
698 // deinterleaves of 2 vectors can be lowered into the following
699 // sequences
700 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
701 // Example sequence:
702 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
703 // vwaddu.vv v10, v8, v9
704 // li a0, -1 (ignored)
705 // vwmaccu.vx v10, a0, v9
706 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
707 return 2 * LT.first * TLI->getLMULCost(LT.second);
708
709 if (Mask[0] == 0 || Mask[0] == 1) {
710 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
711 // Example sequence:
712 // vnsrl.wi v10, v8, 0
713 if (equal(DeinterleaveMask, Mask))
714 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
715 LT.second, CostKind);
716 }
717 }
718 int SubVectorSize;
719 if (LT.second.getScalarSizeInBits() != 1 &&
720 isRepeatedConcatMask(Mask, SubVectorSize)) {
722 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
723 // The cost of extraction from a subvector is 0 if the index is 0.
724 for (unsigned I = 0; I != NumSlides; ++I) {
725 unsigned InsertIndex = SubVectorSize * (1 << I);
726 FixedVectorType *SubTp =
727 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
728 FixedVectorType *DestTp =
730 std::pair<InstructionCost, MVT> DestLT =
732 // Add the cost of whole vector register move because the
733 // destination vector register group for vslideup cannot overlap the
734 // source.
735 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
736 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
737 CostKind, InsertIndex, SubTp);
738 }
739 return Cost;
740 }
741 }
742
743 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
744 SlideCost.isValid())
745 return SlideCost;
746
747 // vrgather + cost of generating the mask constant.
748 // We model this for an unknown mask with a single vrgather.
749 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
750 LT.second.getVectorNumElements() <= 256)) {
751 VectorType *IdxTy =
752 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
753 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
754 return IndexCost +
755 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
756 }
757 break;
758 }
761
762 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
763 SlideCost.isValid())
764 return SlideCost;
765
766 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
767 // register for the second vrgather. We model this for an unknown
768 // (shuffle) mask.
769 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
770 LT.second.getVectorNumElements() <= 256)) {
771 auto &C = SrcTy->getContext();
772 auto EC = SrcTy->getElementCount();
773 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
775 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
776 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
777 return 2 * IndexCost +
778 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
779 LT.second, CostKind) +
780 MaskCost;
781 }
782 break;
783 }
784 }
785
786 auto shouldSplit = [](TTI::ShuffleKind Kind) {
787 switch (Kind) {
788 default:
789 return false;
793 return true;
794 }
795 };
796
797 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
798 shouldSplit(Kind)) {
799 InstructionCost SplitCost =
800 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
801 if (SplitCost.isValid())
802 return SplitCost;
803 }
804 }
805
806 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
807 switch (Kind) {
808 default:
809 // Fallthrough to generic handling.
810 // TODO: Most of these cases will return getInvalid in generic code, and
811 // must be implemented here.
812 break;
814 // Extract at zero is always a subregister extract
815 if (Index == 0)
816 return TTI::TCC_Free;
817
818 // If we're extracting a subvector of at most m1 size at a sub-register
819 // boundary - which unfortunately we need exact vlen to identify - this is
820 // a subregister extract at worst and thus won't require a vslidedown.
821 // TODO: Extend for aligned m2, m4 subvector extracts
822 // TODO: Extend for misalgined (but contained) extracts
823 // TODO: Extend for scalable subvector types
824 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
825 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
826 if (std::optional<unsigned> VLen = ST->getRealVLen();
827 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
828 SubLT.second.getSizeInBits() <= *VLen)
829 return TTI::TCC_Free;
830 }
831
832 // Example sequence:
833 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
834 // vslidedown.vi v8, v9, 2
835 return LT.first *
836 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
838 // Example sequence:
839 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
840 // vslideup.vi v8, v9, 2
841 LT = getTypeLegalizationCost(DstTy);
842 return LT.first *
843 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
844 case TTI::SK_Select: {
845 // Example sequence:
846 // li a0, 90
847 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
848 // vmv.s.x v0, a0
849 // vmerge.vvm v8, v9, v8, v0
850 // We use 2 for the cost of the mask materialization as this is the true
851 // cost for small masks and most shuffles are small. At worst, this cost
852 // should be a very small constant for the constant pool load. As such,
853 // we may bias towards large selects slightly more than truly warranted.
854 return LT.first *
855 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
856 LT.second, CostKind));
857 }
858 case TTI::SK_Broadcast: {
859 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
860 Instruction::InsertElement);
861 if (LT.second.getScalarSizeInBits() == 1) {
862 if (HasScalar) {
863 // Example sequence:
864 // andi a0, a0, 1
865 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
866 // vmv.v.x v8, a0
867 // vmsne.vi v0, v8, 0
868 return LT.first *
869 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
870 LT.second, CostKind));
871 }
872 // Example sequence:
873 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
874 // vmv.v.i v8, 0
875 // vmerge.vim v8, v8, 1, v0
876 // vmv.x.s a0, v8
877 // andi a0, a0, 1
878 // vmv.v.x v8, a0
879 // vmsne.vi v0, v8, 0
880
881 return LT.first *
882 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
883 RISCV::VMV_X_S, RISCV::VMV_V_X,
884 RISCV::VMSNE_VI},
885 LT.second, CostKind));
886 }
887
888 if (HasScalar) {
889 // Example sequence:
890 // vmv.v.x v8, a0
891 return LT.first *
892 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
893 }
894
895 // Example sequence:
896 // vrgather.vi v9, v8, 0
897 return LT.first *
898 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
899 }
900 case TTI::SK_Splice: {
901 // vslidedown+vslideup.
902 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
903 // of similar code, but I think we expand through memory.
904 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
905 if (Index >= 0 && Index < 32)
906 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
907 else if (Index < 0 && Index > -32)
908 Opcodes[1] = RISCV::VSLIDEUP_VI;
909 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
910 }
911 case TTI::SK_Reverse: {
912
913 if (!LT.second.isVector())
915
916 // TODO: Cases to improve here:
917 // * Illegal vector types
918 // * i64 on RV32
919 if (SrcTy->getElementType()->isIntegerTy(1)) {
920 VectorType *WideTy =
921 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
922 cast<VectorType>(SrcTy)->getElementCount());
923 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
925 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
926 nullptr) +
927 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
929 }
930
931 MVT ContainerVT = LT.second;
932 if (LT.second.isFixedLengthVector())
933 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
934 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
935 if (ContainerVT.bitsLE(M1VT)) {
936 // Example sequence:
937 // csrr a0, vlenb
938 // srli a0, a0, 3
939 // addi a0, a0, -1
940 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
941 // vid.v v9
942 // vrsub.vx v10, v9, a0
943 // vrgather.vv v9, v8, v10
944 InstructionCost LenCost = 3;
945 if (LT.second.isFixedLengthVector())
946 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
947 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
948 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
949 if (LT.second.isFixedLengthVector() &&
950 isInt<5>(LT.second.getVectorNumElements() - 1))
951 Opcodes[1] = RISCV::VRSUB_VI;
952 InstructionCost GatherCost =
953 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
954 return LT.first * (LenCost + GatherCost);
955 }
956
957 // At high LMUL, we split into a series of M1 reverses (see
958 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
959 // the resulting gap at the bottom (for fixed vectors only). The important
960 // bit is that the cost scales linearly, not quadratically with LMUL.
961 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
962 InstructionCost FixedCost =
963 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
964 unsigned Ratio =
966 InstructionCost GatherCost =
967 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
968 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
969 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
970 return FixedCost + LT.first * (GatherCost + SlideCost);
971 }
972 }
973 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
974 SubTp);
975}
976
977static unsigned isM1OrSmaller(MVT VT) {
979 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
983}
984
986 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
987 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
988 ArrayRef<Value *> VL) const {
991
992 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
993 // For now, skip all fixed vector cost analysis when P extension is available
994 // to avoid crashes in getMinRVVVectorSizeInBits()
995 if (ST->enablePExtSIMDCodeGen() && isa<FixedVectorType>(Ty)) {
996 return 1; // Treat as single instruction cost for now
997 }
998
999 // A build_vector (which is m1 sized or smaller) can be done in no
1000 // worse than one vslide1down.vx per element in the type. We could
1001 // in theory do an explode_vector in the inverse manner, but our
1002 // lowering today does not have a first class node for this pattern.
1004 Ty, DemandedElts, Insert, Extract, CostKind);
1005 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1006 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1007 if (Ty->getScalarSizeInBits() == 1) {
1008 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
1009 // Note: Implicit scalar anyextend is assumed to be free since the i1
1010 // must be stored in a GPR.
1011 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
1012 CostKind) +
1013 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
1015 }
1016
1017 assert(LT.second.isFixedLengthVector());
1018 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
1019 if (isM1OrSmaller(ContainerVT)) {
1020 InstructionCost BV =
1021 cast<FixedVectorType>(Ty)->getNumElements() *
1022 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1023 if (BV < Cost)
1024 Cost = BV;
1025 }
1026 }
1027 return Cost;
1028}
1029
1033 Type *DataTy = MICA.getDataType();
1034 Align Alignment = MICA.getAlignment();
1035 switch (MICA.getID()) {
1036 case Intrinsic::vp_load_ff: {
1037 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1038 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1040
1041 unsigned AS = MICA.getAddressSpace();
1042 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1043 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1044 }
1045 case Intrinsic::experimental_vp_strided_load:
1046 case Intrinsic::experimental_vp_strided_store:
1047 return getStridedMemoryOpCost(MICA, CostKind);
1048 case Intrinsic::masked_compressstore:
1049 case Intrinsic::masked_expandload:
1051 case Intrinsic::vp_scatter:
1052 case Intrinsic::vp_gather:
1053 case Intrinsic::masked_scatter:
1054 case Intrinsic::masked_gather:
1055 return getGatherScatterOpCost(MICA, CostKind);
1056 case Intrinsic::vp_load:
1057 case Intrinsic::vp_store:
1058 case Intrinsic::masked_load:
1059 case Intrinsic::masked_store:
1060 return getMaskedMemoryOpCost(MICA, CostKind);
1061 }
1063}
1064
1068 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1069 : Instruction::Store;
1070 Type *Src = MICA.getDataType();
1071 Align Alignment = MICA.getAlignment();
1072 unsigned AddressSpace = MICA.getAddressSpace();
1073
1074 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1077
1078 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1079}
1080
1082 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1083 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1084 bool UseMaskForCond, bool UseMaskForGaps) const {
1085
1086 // The interleaved memory access pass will lower (de)interleave ops combined
1087 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1088 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1089 // gap).
1090 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1091 auto *VTy = cast<VectorType>(VecTy);
1092 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1093 // Need to make sure type has't been scalarized
1094 if (LT.second.isVector()) {
1095 auto *SubVecTy =
1096 VectorType::get(VTy->getElementType(),
1097 VTy->getElementCount().divideCoefficientBy(Factor));
1098 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1099 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1100 AddressSpace, DL)) {
1101
1102 // Some processors optimize segment loads/stores as one wide memory op +
1103 // Factor * LMUL shuffle ops.
1104 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1106 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1107 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1108 Cost += Factor * TLI->getLMULCost(SubVecVT);
1109 return LT.first * Cost;
1110 }
1111
1112 // Otherwise, the cost is proportional to the number of elements (VL *
1113 // Factor ops).
1114 InstructionCost MemOpCost =
1115 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1116 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1117 unsigned NumLoads = getEstimatedVLFor(VTy);
1118 return NumLoads * MemOpCost;
1119 }
1120 }
1121 }
1122
1123 // TODO: Return the cost of interleaved accesses for scalable vector when
1124 // unable to convert to segment accesses instructions.
1125 if (isa<ScalableVectorType>(VecTy))
1127
1128 auto *FVTy = cast<FixedVectorType>(VecTy);
1129 InstructionCost MemCost =
1130 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1131 unsigned VF = FVTy->getNumElements() / Factor;
1132
1133 // An interleaved load will look like this for Factor=3:
1134 // %wide.vec = load <12 x i32>, ptr %3, align 4
1135 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1136 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1137 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1138 if (Opcode == Instruction::Load) {
1139 InstructionCost Cost = MemCost;
1140 for (unsigned Index : Indices) {
1141 FixedVectorType *VecTy =
1142 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1143 auto Mask = createStrideMask(Index, Factor, VF);
1144 Mask.resize(VF * Factor, -1);
1145 InstructionCost ShuffleCost =
1147 Mask, CostKind, 0, nullptr, {});
1148 Cost += ShuffleCost;
1149 }
1150 return Cost;
1151 }
1152
1153 // TODO: Model for NF > 2
1154 // We'll need to enhance getShuffleCost to model shuffles that are just
1155 // inserts and extracts into subvectors, since they won't have the full cost
1156 // of a vrgather.
1157 // An interleaved store for 3 vectors of 4 lanes will look like
1158 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1159 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1160 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1161 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1162 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1163 if (Factor != 2)
1164 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1165 Alignment, AddressSpace, CostKind,
1166 UseMaskForCond, UseMaskForGaps);
1167
1168 assert(Opcode == Instruction::Store && "Opcode must be a store");
1169 // For an interleaving store of 2 vectors, we perform one large interleaving
1170 // shuffle that goes into the wide store
1171 auto Mask = createInterleaveMask(VF, Factor);
1172 InstructionCost ShuffleCost =
1174 CostKind, 0, nullptr, {});
1175 return MemCost + ShuffleCost;
1176}
1177
1181
1182 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1183 MICA.getID() == Intrinsic::vp_gather;
1184 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1185 Type *DataTy = MICA.getDataType();
1186 Align Alignment = MICA.getAlignment();
1187 const Instruction *I = MICA.getInst();
1190
1191 if ((Opcode == Instruction::Load &&
1192 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1193 (Opcode == Instruction::Store &&
1194 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1196
1197 // Cost is proportional to the number of memory operations implied. For
1198 // scalable vectors, we use an estimate on that number since we don't
1199 // know exactly what VL will be.
1200 auto &VTy = *cast<VectorType>(DataTy);
1201 InstructionCost MemOpCost =
1202 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1203 {TTI::OK_AnyValue, TTI::OP_None}, I);
1204 unsigned NumLoads = getEstimatedVLFor(&VTy);
1205 return NumLoads * MemOpCost;
1206}
1207
1209 const MemIntrinsicCostAttributes &MICA,
1211 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1212 ? Instruction::Load
1213 : Instruction::Store;
1214 Type *DataTy = MICA.getDataType();
1215 bool VariableMask = MICA.getVariableMask();
1216 Align Alignment = MICA.getAlignment();
1217 bool IsLegal = (Opcode == Instruction::Store &&
1218 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1219 (Opcode == Instruction::Load &&
1220 isLegalMaskedExpandLoad(DataTy, Alignment));
1221 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1223 // Example compressstore sequence:
1224 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1225 // vcompress.vm v10, v8, v0
1226 // vcpop.m a1, v0
1227 // vsetvli zero, a1, e32, m2, ta, ma
1228 // vse32.v v10, (a0)
1229 // Example expandload sequence:
1230 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1231 // vcpop.m a1, v0
1232 // vsetvli zero, a1, e32, m2, ta, ma
1233 // vle32.v v10, (a0)
1234 // vsetivli zero, 8, e32, m2, ta, ma
1235 // viota.m v12, v0
1236 // vrgather.vv v8, v10, v12, v0.t
1237 auto MemOpCost =
1238 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1239 auto LT = getTypeLegalizationCost(DataTy);
1240 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1241 if (VariableMask)
1242 Opcodes.push_back(RISCV::VCPOP_M);
1243 if (Opcode == Instruction::Store)
1244 Opcodes.append({RISCV::VCOMPRESS_VM});
1245 else
1246 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1247 return MemOpCost +
1248 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1249}
1250
1254
1255 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1256 ? Instruction::Load
1257 : Instruction::Store;
1258
1259 Type *DataTy = MICA.getDataType();
1260 Align Alignment = MICA.getAlignment();
1261 const Instruction *I = MICA.getInst();
1262
1263 if (!isLegalStridedLoadStore(DataTy, Alignment))
1265
1267 return TTI::TCC_Basic;
1268
1269 // Cost is proportional to the number of memory operations implied. For
1270 // scalable vectors, we use an estimate on that number since we don't
1271 // know exactly what VL will be.
1272 auto &VTy = *cast<VectorType>(DataTy);
1273 InstructionCost MemOpCost =
1274 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1275 {TTI::OK_AnyValue, TTI::OP_None}, I);
1276 unsigned NumLoads = getEstimatedVLFor(&VTy);
1277 return NumLoads * MemOpCost;
1278}
1279
1282 // FIXME: This is a property of the default vector convention, not
1283 // all possible calling conventions. Fixing that will require
1284 // some TTI API and SLP rework.
1287 for (auto *Ty : Tys) {
1288 if (!Ty->isVectorTy())
1289 continue;
1290 Align A = DL.getPrefTypeAlign(Ty);
1291 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1292 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1293 }
1294 return Cost;
1295}
1296
1297// Currently, these represent both throughput and codesize costs
1298// for the respective intrinsics. The costs in this table are simply
1299// instruction counts with the following adjustments made:
1300// * One vsetvli is considered free.
1302 {Intrinsic::floor, MVT::f32, 9},
1303 {Intrinsic::floor, MVT::f64, 9},
1304 {Intrinsic::ceil, MVT::f32, 9},
1305 {Intrinsic::ceil, MVT::f64, 9},
1306 {Intrinsic::trunc, MVT::f32, 7},
1307 {Intrinsic::trunc, MVT::f64, 7},
1308 {Intrinsic::round, MVT::f32, 9},
1309 {Intrinsic::round, MVT::f64, 9},
1310 {Intrinsic::roundeven, MVT::f32, 9},
1311 {Intrinsic::roundeven, MVT::f64, 9},
1312 {Intrinsic::rint, MVT::f32, 7},
1313 {Intrinsic::rint, MVT::f64, 7},
1314 {Intrinsic::nearbyint, MVT::f32, 9},
1315 {Intrinsic::nearbyint, MVT::f64, 9},
1316 {Intrinsic::bswap, MVT::i16, 3},
1317 {Intrinsic::bswap, MVT::i32, 12},
1318 {Intrinsic::bswap, MVT::i64, 31},
1319 {Intrinsic::vp_bswap, MVT::i16, 3},
1320 {Intrinsic::vp_bswap, MVT::i32, 12},
1321 {Intrinsic::vp_bswap, MVT::i64, 31},
1322 {Intrinsic::vp_fshl, MVT::i8, 7},
1323 {Intrinsic::vp_fshl, MVT::i16, 7},
1324 {Intrinsic::vp_fshl, MVT::i32, 7},
1325 {Intrinsic::vp_fshl, MVT::i64, 7},
1326 {Intrinsic::vp_fshr, MVT::i8, 7},
1327 {Intrinsic::vp_fshr, MVT::i16, 7},
1328 {Intrinsic::vp_fshr, MVT::i32, 7},
1329 {Intrinsic::vp_fshr, MVT::i64, 7},
1330 {Intrinsic::bitreverse, MVT::i8, 17},
1331 {Intrinsic::bitreverse, MVT::i16, 24},
1332 {Intrinsic::bitreverse, MVT::i32, 33},
1333 {Intrinsic::bitreverse, MVT::i64, 52},
1334 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1335 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1336 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1337 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1338 {Intrinsic::ctpop, MVT::i8, 12},
1339 {Intrinsic::ctpop, MVT::i16, 19},
1340 {Intrinsic::ctpop, MVT::i32, 20},
1341 {Intrinsic::ctpop, MVT::i64, 21},
1342 {Intrinsic::ctlz, MVT::i8, 19},
1343 {Intrinsic::ctlz, MVT::i16, 28},
1344 {Intrinsic::ctlz, MVT::i32, 31},
1345 {Intrinsic::ctlz, MVT::i64, 35},
1346 {Intrinsic::cttz, MVT::i8, 16},
1347 {Intrinsic::cttz, MVT::i16, 23},
1348 {Intrinsic::cttz, MVT::i32, 24},
1349 {Intrinsic::cttz, MVT::i64, 25},
1350 {Intrinsic::vp_ctpop, MVT::i8, 12},
1351 {Intrinsic::vp_ctpop, MVT::i16, 19},
1352 {Intrinsic::vp_ctpop, MVT::i32, 20},
1353 {Intrinsic::vp_ctpop, MVT::i64, 21},
1354 {Intrinsic::vp_ctlz, MVT::i8, 19},
1355 {Intrinsic::vp_ctlz, MVT::i16, 28},
1356 {Intrinsic::vp_ctlz, MVT::i32, 31},
1357 {Intrinsic::vp_ctlz, MVT::i64, 35},
1358 {Intrinsic::vp_cttz, MVT::i8, 16},
1359 {Intrinsic::vp_cttz, MVT::i16, 23},
1360 {Intrinsic::vp_cttz, MVT::i32, 24},
1361 {Intrinsic::vp_cttz, MVT::i64, 25},
1362};
1363
1367 auto *RetTy = ICA.getReturnType();
1368 switch (ICA.getID()) {
1369 case Intrinsic::lrint:
1370 case Intrinsic::llrint:
1371 case Intrinsic::lround:
1372 case Intrinsic::llround: {
1373 auto LT = getTypeLegalizationCost(RetTy);
1374 Type *SrcTy = ICA.getArgTypes().front();
1375 auto SrcLT = getTypeLegalizationCost(SrcTy);
1376 if (ST->hasVInstructions() && LT.second.isVector()) {
1378 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1379 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1380 if (LT.second.getVectorElementType() == MVT::bf16) {
1381 if (!ST->hasVInstructionsBF16Minimal())
1383 if (DstEltSz == 32)
1384 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1385 else
1386 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1387 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1388 !ST->hasVInstructionsF16()) {
1389 if (!ST->hasVInstructionsF16Minimal())
1391 if (DstEltSz == 32)
1392 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1393 else
1394 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1395
1396 } else if (SrcEltSz > DstEltSz) {
1397 Ops = {RISCV::VFNCVT_X_F_W};
1398 } else if (SrcEltSz < DstEltSz) {
1399 Ops = {RISCV::VFWCVT_X_F_V};
1400 } else {
1401 Ops = {RISCV::VFCVT_X_F_V};
1402 }
1403
1404 // We need to use the source LMUL in the case of a narrowing op, and the
1405 // destination LMUL otherwise.
1406 if (SrcEltSz > DstEltSz)
1407 return SrcLT.first *
1408 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1409 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1410 }
1411 break;
1412 }
1413 case Intrinsic::ceil:
1414 case Intrinsic::floor:
1415 case Intrinsic::trunc:
1416 case Intrinsic::rint:
1417 case Intrinsic::round:
1418 case Intrinsic::roundeven: {
1419 // These all use the same code.
1420 auto LT = getTypeLegalizationCost(RetTy);
1421 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1422 return LT.first * 8;
1423 break;
1424 }
1425 case Intrinsic::umin:
1426 case Intrinsic::umax:
1427 case Intrinsic::smin:
1428 case Intrinsic::smax: {
1429 auto LT = getTypeLegalizationCost(RetTy);
1430 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1431 return LT.first;
1432
1433 if (ST->hasVInstructions() && LT.second.isVector()) {
1434 unsigned Op;
1435 switch (ICA.getID()) {
1436 case Intrinsic::umin:
1437 Op = RISCV::VMINU_VV;
1438 break;
1439 case Intrinsic::umax:
1440 Op = RISCV::VMAXU_VV;
1441 break;
1442 case Intrinsic::smin:
1443 Op = RISCV::VMIN_VV;
1444 break;
1445 case Intrinsic::smax:
1446 Op = RISCV::VMAX_VV;
1447 break;
1448 }
1449 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1450 }
1451 break;
1452 }
1453 case Intrinsic::sadd_sat:
1454 case Intrinsic::ssub_sat:
1455 case Intrinsic::uadd_sat:
1456 case Intrinsic::usub_sat: {
1457 auto LT = getTypeLegalizationCost(RetTy);
1458 if (ST->hasVInstructions() && LT.second.isVector()) {
1459 unsigned Op;
1460 switch (ICA.getID()) {
1461 case Intrinsic::sadd_sat:
1462 Op = RISCV::VSADD_VV;
1463 break;
1464 case Intrinsic::ssub_sat:
1465 Op = RISCV::VSSUBU_VV;
1466 break;
1467 case Intrinsic::uadd_sat:
1468 Op = RISCV::VSADDU_VV;
1469 break;
1470 case Intrinsic::usub_sat:
1471 Op = RISCV::VSSUBU_VV;
1472 break;
1473 }
1474 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1475 }
1476 break;
1477 }
1478 case Intrinsic::fma:
1479 case Intrinsic::fmuladd: {
1480 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1481 auto LT = getTypeLegalizationCost(RetTy);
1482 if (ST->hasVInstructions() && LT.second.isVector())
1483 return LT.first *
1484 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1485 break;
1486 }
1487 case Intrinsic::fabs: {
1488 auto LT = getTypeLegalizationCost(RetTy);
1489 if (ST->hasVInstructions() && LT.second.isVector()) {
1490 // lui a0, 8
1491 // addi a0, a0, -1
1492 // vsetvli a1, zero, e16, m1, ta, ma
1493 // vand.vx v8, v8, a0
1494 // f16 with zvfhmin and bf16 with zvfhbmin
1495 if (LT.second.getVectorElementType() == MVT::bf16 ||
1496 (LT.second.getVectorElementType() == MVT::f16 &&
1497 !ST->hasVInstructionsF16()))
1498 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1499 CostKind) +
1500 2;
1501 else
1502 return LT.first *
1503 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1504 }
1505 break;
1506 }
1507 case Intrinsic::sqrt: {
1508 auto LT = getTypeLegalizationCost(RetTy);
1509 if (ST->hasVInstructions() && LT.second.isVector()) {
1512 MVT ConvType = LT.second;
1513 MVT FsqrtType = LT.second;
1514 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1515 // will be spilt.
1516 if (LT.second.getVectorElementType() == MVT::bf16) {
1517 if (LT.second == MVT::nxv32bf16) {
1518 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1519 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1520 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1521 ConvType = MVT::nxv16f16;
1522 FsqrtType = MVT::nxv16f32;
1523 } else {
1524 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1525 FsqrtOp = {RISCV::VFSQRT_V};
1526 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1527 }
1528 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1529 !ST->hasVInstructionsF16()) {
1530 if (LT.second == MVT::nxv32f16) {
1531 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1532 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1533 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1534 ConvType = MVT::nxv16f16;
1535 FsqrtType = MVT::nxv16f32;
1536 } else {
1537 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1538 FsqrtOp = {RISCV::VFSQRT_V};
1539 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1540 }
1541 } else {
1542 FsqrtOp = {RISCV::VFSQRT_V};
1543 }
1544
1545 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1546 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1547 }
1548 break;
1549 }
1550 case Intrinsic::cttz:
1551 case Intrinsic::ctlz:
1552 case Intrinsic::ctpop: {
1553 auto LT = getTypeLegalizationCost(RetTy);
1554 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1555 unsigned Op;
1556 switch (ICA.getID()) {
1557 case Intrinsic::cttz:
1558 Op = RISCV::VCTZ_V;
1559 break;
1560 case Intrinsic::ctlz:
1561 Op = RISCV::VCLZ_V;
1562 break;
1563 case Intrinsic::ctpop:
1564 Op = RISCV::VCPOP_V;
1565 break;
1566 }
1567 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1568 }
1569 break;
1570 }
1571 case Intrinsic::abs: {
1572 auto LT = getTypeLegalizationCost(RetTy);
1573 if (ST->hasVInstructions() && LT.second.isVector()) {
1574 // vrsub.vi v10, v8, 0
1575 // vmax.vv v8, v8, v10
1576 return LT.first *
1577 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1578 LT.second, CostKind);
1579 }
1580 break;
1581 }
1582 case Intrinsic::fshl:
1583 case Intrinsic::fshr: {
1584 if (ICA.getArgs().empty())
1585 break;
1586
1587 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1588 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1589 // instruction.
1590 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1591 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1592 (RetTy->getIntegerBitWidth() == 32 ||
1593 RetTy->getIntegerBitWidth() == 64) &&
1594 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1595 return 1;
1596 }
1597 break;
1598 }
1599 case Intrinsic::get_active_lane_mask: {
1600 if (ST->hasVInstructions()) {
1601 Type *ExpRetTy = VectorType::get(
1602 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1603 auto LT = getTypeLegalizationCost(ExpRetTy);
1604
1605 // vid.v v8 // considered hoisted
1606 // vsaddu.vx v8, v8, a0
1607 // vmsltu.vx v0, v8, a1
1608 return LT.first *
1609 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1610 LT.second, CostKind);
1611 }
1612 break;
1613 }
1614 // TODO: add more intrinsic
1615 case Intrinsic::stepvector: {
1616 auto LT = getTypeLegalizationCost(RetTy);
1617 // Legalisation of illegal types involves an `index' instruction plus
1618 // (LT.first - 1) vector adds.
1619 if (ST->hasVInstructions())
1620 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1621 (LT.first - 1) *
1622 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1623 return 1 + (LT.first - 1);
1624 }
1625 case Intrinsic::experimental_cttz_elts: {
1626 Type *ArgTy = ICA.getArgTypes()[0];
1627 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1628 if (getTLI()->shouldExpandCttzElements(ArgType))
1629 break;
1630 InstructionCost Cost = getRISCVInstructionCost(
1631 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1632
1633 // If zero_is_poison is false, then we will generate additional
1634 // cmp + select instructions to convert -1 to EVL.
1635 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1636 if (ICA.getArgs().size() > 1 &&
1637 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1638 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1640 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1642
1643 return Cost;
1644 }
1645 case Intrinsic::experimental_vp_splice: {
1646 // To support type-based query from vectorizer, set the index to 0.
1647 // Note that index only change the cost from vslide.vx to vslide.vi and in
1648 // current implementations they have same costs.
1650 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1652 }
1653 case Intrinsic::fptoui_sat:
1654 case Intrinsic::fptosi_sat: {
1656 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1657 Type *SrcTy = ICA.getArgTypes()[0];
1658
1659 auto SrcLT = getTypeLegalizationCost(SrcTy);
1660 auto DstLT = getTypeLegalizationCost(RetTy);
1661 if (!SrcTy->isVectorTy())
1662 break;
1663
1664 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1666
1667 Cost +=
1668 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1669 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1670
1671 // Handle NaN.
1672 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1673 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1674 Type *CondTy = RetTy->getWithNewBitWidth(1);
1675 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1677 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1679 return Cost;
1680 }
1681 }
1682
1683 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1684 if (auto LT = getTypeLegalizationCost(RetTy);
1685 LT.second.isVector()) {
1686 MVT EltTy = LT.second.getVectorElementType();
1687 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1688 ICA.getID(), EltTy))
1689 return LT.first * Entry->Cost;
1690 }
1691 }
1692
1694}
1695
1698 const SCEV *Ptr,
1700 // Address computations for vector indexed load/store likely require an offset
1701 // and/or scaling.
1702 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1703 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1704
1705 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1706}
1707
1709 Type *Src,
1712 const Instruction *I) const {
1713 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1714 if (!IsVectorType)
1715 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1716
1717 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1718 // For now, skip all fixed vector cost analysis when P extension is available
1719 // to avoid crashes in getMinRVVVectorSizeInBits()
1720 if (ST->enablePExtSIMDCodeGen() &&
1722 return 1; // Treat as single instruction cost for now
1723 }
1724
1725 // FIXME: Need to compute legalizing cost for illegal types. The current
1726 // code handles only legal types and those which can be trivially
1727 // promoted to legal.
1728 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1729 Dst->getScalarSizeInBits() > ST->getELen())
1730 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1731
1732 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1733 assert(ISD && "Invalid opcode");
1734 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1735 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1736
1737 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1738 // The shared implementation doesn't model vector widening during legalization
1739 // and instead assumes scalarization. In order to scalarize an <N x i1>
1740 // vector, we need to extend/trunc to/from i8. If we don't special case
1741 // this, we can get an infinite recursion cycle.
1742 switch (ISD) {
1743 default:
1744 break;
1745 case ISD::SIGN_EXTEND:
1746 case ISD::ZERO_EXTEND:
1747 if (Src->getScalarSizeInBits() == 1) {
1748 // We do not use vsext/vzext to extend from mask vector.
1749 // Instead we use the following instructions to extend from mask vector:
1750 // vmv.v.i v8, 0
1751 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1752 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1753 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1754 DstLT.second, CostKind) +
1755 DstLT.first - 1;
1756 }
1757 break;
1758 case ISD::TRUNCATE:
1759 if (Dst->getScalarSizeInBits() == 1) {
1760 // We do not use several vncvt to truncate to mask vector. So we could
1761 // not use PowDiff to calculate it.
1762 // Instead we use the following instructions to truncate to mask vector:
1763 // vand.vi v8, v8, 1
1764 // vmsne.vi v0, v8, 0
1765 return SrcLT.first *
1766 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1767 SrcLT.second, CostKind) +
1768 SrcLT.first - 1;
1769 }
1770 break;
1771 };
1772
1773 // Our actual lowering for the case where a wider legal type is available
1774 // uses promotion to the wider type. This is reflected in the result of
1775 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1776 // scalarized if the legalized Src and Dst are not equal sized.
1777 const DataLayout &DL = this->getDataLayout();
1778 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1779 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1780 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1781 SrcLT.second.getSizeInBits()) ||
1782 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1783 DstLT.second.getSizeInBits()) ||
1784 SrcLT.first > 1 || DstLT.first > 1)
1785 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1786
1787 // The split cost is handled by the base getCastInstrCost
1788 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1789
1790 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1791 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1792 switch (ISD) {
1793 case ISD::SIGN_EXTEND:
1794 case ISD::ZERO_EXTEND: {
1795 if ((PowDiff < 1) || (PowDiff > 3))
1796 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1797 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1798 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1799 unsigned Op =
1800 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1801 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1802 }
1803 case ISD::TRUNCATE:
1804 case ISD::FP_EXTEND:
1805 case ISD::FP_ROUND: {
1806 // Counts of narrow/widen instructions.
1807 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1808 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1809
1810 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1811 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1812 : RISCV::VFNCVT_F_F_W;
1814 for (; SrcEltSize != DstEltSize;) {
1815 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1816 ? MVT::getIntegerVT(DstEltSize)
1817 : MVT::getFloatingPointVT(DstEltSize);
1818 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1819 DstEltSize =
1820 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1821 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1822 }
1823 return Cost;
1824 }
1825 case ISD::FP_TO_SINT:
1826 case ISD::FP_TO_UINT: {
1827 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1828 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1829 unsigned FWCVT =
1830 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1831 unsigned FNCVT =
1832 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1833 unsigned SrcEltSize = Src->getScalarSizeInBits();
1834 unsigned DstEltSize = Dst->getScalarSizeInBits();
1836 if ((SrcEltSize == 16) &&
1837 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1838 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1839 // pre-widening to f32 and then convert f32 to integer
1840 VectorType *VecF32Ty =
1841 VectorType::get(Type::getFloatTy(Dst->getContext()),
1842 cast<VectorType>(Dst)->getElementCount());
1843 std::pair<InstructionCost, MVT> VecF32LT =
1844 getTypeLegalizationCost(VecF32Ty);
1845 Cost +=
1846 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1847 VecF32LT.second, CostKind);
1848 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1849 return Cost;
1850 }
1851 if (DstEltSize == SrcEltSize)
1852 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1853 else if (DstEltSize > SrcEltSize)
1854 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1855 else { // (SrcEltSize > DstEltSize)
1856 // First do a narrowing conversion to an integer half the size, then
1857 // truncate if needed.
1858 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1859 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1860 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1861 if ((SrcEltSize / 2) > DstEltSize) {
1862 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1863 Cost +=
1864 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1865 }
1866 }
1867 return Cost;
1868 }
1869 case ISD::SINT_TO_FP:
1870 case ISD::UINT_TO_FP: {
1871 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1872 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1873 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1874 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1875 unsigned SrcEltSize = Src->getScalarSizeInBits();
1876 unsigned DstEltSize = Dst->getScalarSizeInBits();
1877
1879 if ((DstEltSize == 16) &&
1880 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1881 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
1882 // it is converted to f32 and then converted to f16
1883 VectorType *VecF32Ty =
1884 VectorType::get(Type::getFloatTy(Dst->getContext()),
1885 cast<VectorType>(Dst)->getElementCount());
1886 std::pair<InstructionCost, MVT> VecF32LT =
1887 getTypeLegalizationCost(VecF32Ty);
1888 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1889 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1890 DstLT.second, CostKind);
1891 return Cost;
1892 }
1893
1894 if (DstEltSize == SrcEltSize)
1895 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1896 else if (DstEltSize > SrcEltSize) {
1897 if ((DstEltSize / 2) > SrcEltSize) {
1898 VectorType *VecTy =
1899 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1900 cast<VectorType>(Dst)->getElementCount());
1901 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1902 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1903 }
1904 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1905 } else
1906 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1907 return Cost;
1908 }
1909 }
1910 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1911}
1912
1913unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1914 if (isa<ScalableVectorType>(Ty)) {
1915 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1916 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1917 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1918 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1919 }
1920 return cast<FixedVectorType>(Ty)->getNumElements();
1921}
1922
1925 FastMathFlags FMF,
1927 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1928 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1929
1930 // Skip if scalar size of Ty is bigger than ELEN.
1931 if (Ty->getScalarSizeInBits() > ST->getELen())
1932 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1933
1934 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1935 if (Ty->getElementType()->isIntegerTy(1)) {
1936 // SelectionDAGBuilder does following transforms:
1937 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1938 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1939 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1940 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1941 else
1942 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1943 }
1944
1945 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1947 InstructionCost ExtraCost = 0;
1948 switch (IID) {
1949 case Intrinsic::maximum:
1950 if (FMF.noNaNs()) {
1951 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1952 } else {
1953 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1954 RISCV::VFMV_F_S};
1955 // Cost of Canonical Nan + branch
1956 // lui a0, 523264
1957 // fmv.w.x fa0, a0
1958 Type *DstTy = Ty->getScalarType();
1959 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1960 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1961 ExtraCost = 1 +
1962 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1964 getCFInstrCost(Instruction::Br, CostKind);
1965 }
1966 break;
1967
1968 case Intrinsic::minimum:
1969 if (FMF.noNaNs()) {
1970 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1971 } else {
1972 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1973 RISCV::VFMV_F_S};
1974 // Cost of Canonical Nan + branch
1975 // lui a0, 523264
1976 // fmv.w.x fa0, a0
1977 Type *DstTy = Ty->getScalarType();
1978 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1979 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1980 ExtraCost = 1 +
1981 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1983 getCFInstrCost(Instruction::Br, CostKind);
1984 }
1985 break;
1986 }
1987 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1988 }
1989
1990 // IR Reduction is composed by one rvv reduction instruction and vmv
1991 unsigned SplitOp;
1993 switch (IID) {
1994 default:
1995 llvm_unreachable("Unsupported intrinsic");
1996 case Intrinsic::smax:
1997 SplitOp = RISCV::VMAX_VV;
1998 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1999 break;
2000 case Intrinsic::smin:
2001 SplitOp = RISCV::VMIN_VV;
2002 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2003 break;
2004 case Intrinsic::umax:
2005 SplitOp = RISCV::VMAXU_VV;
2006 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2007 break;
2008 case Intrinsic::umin:
2009 SplitOp = RISCV::VMINU_VV;
2010 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2011 break;
2012 case Intrinsic::maxnum:
2013 SplitOp = RISCV::VFMAX_VV;
2014 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2015 break;
2016 case Intrinsic::minnum:
2017 SplitOp = RISCV::VFMIN_VV;
2018 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2019 break;
2020 }
2021 // Add a cost for data larger than LMUL8
2022 InstructionCost SplitCost =
2023 (LT.first > 1) ? (LT.first - 1) *
2024 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2025 : 0;
2026 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2027}
2028
2031 std::optional<FastMathFlags> FMF,
2033 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2034 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2035
2036 // Skip if scalar size of Ty is bigger than ELEN.
2037 if (Ty->getScalarSizeInBits() > ST->getELen())
2038 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2039
2040 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2041 assert(ISD && "Invalid opcode");
2042
2043 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2044 ISD != ISD::FADD)
2045 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2046
2047 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2048 Type *ElementTy = Ty->getElementType();
2049 if (ElementTy->isIntegerTy(1)) {
2050 // Example sequences:
2051 // vfirst.m a0, v0
2052 // seqz a0, a0
2053 if (LT.second == MVT::v1i1)
2054 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2055 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2057
2058 if (ISD == ISD::AND) {
2059 // Example sequences:
2060 // vmand.mm v8, v9, v8 ; needed every time type is split
2061 // vmnot.m v8, v0 ; alias for vmnand
2062 // vcpop.m a0, v8
2063 // seqz a0, a0
2064
2065 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2066 // For LMUL <= 8, there is no splitting,
2067 // the sequences are vmnot, vcpop and seqz.
2068 // When LMUL > 8 and split = 1,
2069 // the sequences are vmnand, vcpop and seqz.
2070 // When LMUL > 8 and split > 1,
2071 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2072 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2073 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2074 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2075 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2076 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2078 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2079 // Example sequences:
2080 // vsetvli a0, zero, e8, mf8, ta, ma
2081 // vmxor.mm v8, v0, v8 ; needed every time type is split
2082 // vcpop.m a0, v8
2083 // andi a0, a0, 1
2084 return (LT.first - 1) *
2085 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2086 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2087 } else {
2088 assert(ISD == ISD::OR);
2089 // Example sequences:
2090 // vsetvli a0, zero, e8, mf8, ta, ma
2091 // vmor.mm v8, v9, v8 ; needed every time type is split
2092 // vcpop.m a0, v0
2093 // snez a0, a0
2094 return (LT.first - 1) *
2095 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2096 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2097 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2099 }
2100 }
2101
2102 // IR Reduction of or/and is composed by one vmv and one rvv reduction
2103 // instruction, and others is composed by two vmv and one rvv reduction
2104 // instruction
2105 unsigned SplitOp;
2107 switch (ISD) {
2108 case ISD::ADD:
2109 SplitOp = RISCV::VADD_VV;
2110 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2111 break;
2112 case ISD::OR:
2113 SplitOp = RISCV::VOR_VV;
2114 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2115 break;
2116 case ISD::XOR:
2117 SplitOp = RISCV::VXOR_VV;
2118 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2119 break;
2120 case ISD::AND:
2121 SplitOp = RISCV::VAND_VV;
2122 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2123 break;
2124 case ISD::FADD:
2125 // We can't promote f16/bf16 fadd reductions.
2126 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2127 LT.second.getScalarType() == MVT::bf16)
2128 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2130 Opcodes.push_back(RISCV::VFMV_S_F);
2131 for (unsigned i = 0; i < LT.first.getValue(); i++)
2132 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2133 Opcodes.push_back(RISCV::VFMV_F_S);
2134 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2135 }
2136 SplitOp = RISCV::VFADD_VV;
2137 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2138 break;
2139 }
2140 // Add a cost for data larger than LMUL8
2141 InstructionCost SplitCost =
2142 (LT.first > 1) ? (LT.first - 1) *
2143 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2144 : 0;
2145 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2146}
2147
2149 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2150 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2151 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2152 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2153 FMF, CostKind);
2154
2155 // Skip if scalar size of ResTy is bigger than ELEN.
2156 if (ResTy->getScalarSizeInBits() > ST->getELen())
2157 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2158 FMF, CostKind);
2159
2160 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2161 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2162 FMF, CostKind);
2163
2164 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2165
2166 if (IsUnsigned && Opcode == Instruction::Add &&
2167 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2168 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2169 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2170 return LT.first *
2171 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2172 }
2173
2174 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2175 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2176 FMF, CostKind);
2177
2178 return (LT.first - 1) +
2179 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2180}
2181
2185 assert(OpInfo.isConstant() && "non constant operand?");
2186 if (!isa<VectorType>(Ty))
2187 // FIXME: We need to account for immediate materialization here, but doing
2188 // a decent job requires more knowledge about the immediate than we
2189 // currently have here.
2190 return 0;
2191
2192 if (OpInfo.isUniform())
2193 // vmv.v.i, vmv.v.x, or vfmv.v.f
2194 // We ignore the cost of the scalar constant materialization to be consistent
2195 // with how we treat scalar constants themselves just above.
2196 return 1;
2197
2198 return getConstantPoolLoadCost(Ty, CostKind);
2199}
2200
2202 Align Alignment,
2203 unsigned AddressSpace,
2205 TTI::OperandValueInfo OpInfo,
2206 const Instruction *I) const {
2207 EVT VT = TLI->getValueType(DL, Src, true);
2208 // Type legalization can't handle structs
2209 if (VT == MVT::Other)
2210 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2211 CostKind, OpInfo, I);
2212
2214 if (Opcode == Instruction::Store && OpInfo.isConstant())
2215 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2216
2217 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2218
2219 InstructionCost BaseCost = [&]() {
2220 InstructionCost Cost = LT.first;
2222 return Cost;
2223
2224 // Our actual lowering for the case where a wider legal type is available
2225 // uses the a VL predicated load on the wider type. This is reflected in
2226 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2227 // widened cases are scalarized.
2228 const DataLayout &DL = this->getDataLayout();
2229 if (Src->isVectorTy() && LT.second.isVector() &&
2230 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2231 LT.second.getSizeInBits()))
2232 return Cost;
2233
2234 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2235 CostKind, OpInfo, I);
2236 }();
2237
2238 // Assume memory ops cost scale with the number of vector registers
2239 // possible accessed by the instruction. Note that BasicTTI already
2240 // handles the LT.first term for us.
2241 if (ST->hasVInstructions() && LT.second.isVector() &&
2243 BaseCost *= TLI->getLMULCost(LT.second);
2244 return Cost + BaseCost;
2245}
2246
2248 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2250 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2252 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2253 Op1Info, Op2Info, I);
2254
2255 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2256 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2257 Op1Info, Op2Info, I);
2258
2259 // Skip if scalar size of ValTy is bigger than ELEN.
2260 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2261 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2262 Op1Info, Op2Info, I);
2263
2264 auto GetConstantMatCost =
2265 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2266 if (OpInfo.isUniform())
2267 // We return 0 we currently ignore the cost of materializing scalar
2268 // constants in GPRs.
2269 return 0;
2270
2271 return getConstantPoolLoadCost(ValTy, CostKind);
2272 };
2273
2274 InstructionCost ConstantMatCost;
2275 if (Op1Info.isConstant())
2276 ConstantMatCost += GetConstantMatCost(Op1Info);
2277 if (Op2Info.isConstant())
2278 ConstantMatCost += GetConstantMatCost(Op2Info);
2279
2280 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2281 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2282 if (CondTy->isVectorTy()) {
2283 if (ValTy->getScalarSizeInBits() == 1) {
2284 // vmandn.mm v8, v8, v9
2285 // vmand.mm v9, v0, v9
2286 // vmor.mm v0, v9, v8
2287 return ConstantMatCost +
2288 LT.first *
2289 getRISCVInstructionCost(
2290 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2291 LT.second, CostKind);
2292 }
2293 // vselect and max/min are supported natively.
2294 return ConstantMatCost +
2295 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2296 CostKind);
2297 }
2298
2299 if (ValTy->getScalarSizeInBits() == 1) {
2300 // vmv.v.x v9, a0
2301 // vmsne.vi v9, v9, 0
2302 // vmandn.mm v8, v8, v9
2303 // vmand.mm v9, v0, v9
2304 // vmor.mm v0, v9, v8
2305 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2306 return ConstantMatCost +
2307 LT.first *
2308 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2309 InterimVT, CostKind) +
2310 LT.first * getRISCVInstructionCost(
2311 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2312 LT.second, CostKind);
2313 }
2314
2315 // vmv.v.x v10, a0
2316 // vmsne.vi v0, v10, 0
2317 // vmerge.vvm v8, v9, v8, v0
2318 return ConstantMatCost +
2319 LT.first * getRISCVInstructionCost(
2320 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2321 LT.second, CostKind);
2322 }
2323
2324 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2325 CmpInst::isIntPredicate(VecPred)) {
2326 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2327 // provided they incur the same cost across all implementations
2328 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2329 LT.second,
2330 CostKind);
2331 }
2332
2333 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2334 CmpInst::isFPPredicate(VecPred)) {
2335
2336 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2337 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2338 return ConstantMatCost +
2339 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2340
2341 // If we do not support the input floating point vector type, use the base
2342 // one which will calculate as:
2343 // ScalarizeCost + Num * Cost for fixed vector,
2344 // InvalidCost for scalable vector.
2345 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2346 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2347 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2348 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2349 Op1Info, Op2Info, I);
2350
2351 // Assuming vector fp compare and mask instructions are all the same cost
2352 // until a need arises to differentiate them.
2353 switch (VecPred) {
2354 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2355 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2356 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2357 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2358 return ConstantMatCost +
2359 LT.first * getRISCVInstructionCost(
2360 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2361 LT.second, CostKind);
2362
2363 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2364 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2365 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2366 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2367 return ConstantMatCost +
2368 LT.first *
2369 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2370 LT.second, CostKind);
2371
2372 case CmpInst::FCMP_OEQ: // vmfeq.vv
2373 case CmpInst::FCMP_OGT: // vmflt.vv
2374 case CmpInst::FCMP_OGE: // vmfle.vv
2375 case CmpInst::FCMP_OLT: // vmflt.vv
2376 case CmpInst::FCMP_OLE: // vmfle.vv
2377 case CmpInst::FCMP_UNE: // vmfne.vv
2378 return ConstantMatCost +
2379 LT.first *
2380 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2381 default:
2382 break;
2383 }
2384 }
2385
2386 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2387 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2388 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2389 // be (0 + select instr cost).
2390 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2391 ValTy->isIntegerTy() && !I->user_empty()) {
2392 if (all_of(I->users(), [&](const User *U) {
2393 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2394 U->getType()->isIntegerTy() &&
2395 !isa<ConstantData>(U->getOperand(1)) &&
2396 !isa<ConstantData>(U->getOperand(2));
2397 }))
2398 return 0;
2399 }
2400
2401 // TODO: Add cost for scalar type.
2402
2403 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2404 Op1Info, Op2Info, I);
2405}
2406
2409 const Instruction *I) const {
2411 return Opcode == Instruction::PHI ? 0 : 1;
2412 // Branches are assumed to be predicted.
2413 return 0;
2414}
2415
2418 unsigned Index,
2419 const Value *Op0,
2420 const Value *Op1) const {
2421 assert(Val->isVectorTy() && "This must be a vector type");
2422
2423 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2424 // For now, skip all fixed vector cost analysis when P extension is available
2425 // to avoid crashes in getMinRVVVectorSizeInBits()
2426 if (ST->enablePExtSIMDCodeGen() && isa<FixedVectorType>(Val)) {
2427 return 1; // Treat as single instruction cost for now
2428 }
2429
2430 if (Opcode != Instruction::ExtractElement &&
2431 Opcode != Instruction::InsertElement)
2432 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2433
2434 // Legalize the type.
2435 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2436
2437 // This type is legalized to a scalar type.
2438 if (!LT.second.isVector()) {
2439 auto *FixedVecTy = cast<FixedVectorType>(Val);
2440 // If Index is a known constant, cost is zero.
2441 if (Index != -1U)
2442 return 0;
2443 // Extract/InsertElement with non-constant index is very costly when
2444 // scalarized; estimate cost of loads/stores sequence via the stack:
2445 // ExtractElement cost: store vector to stack, load scalar;
2446 // InsertElement cost: store vector to stack, store scalar, load vector.
2447 Type *ElemTy = FixedVecTy->getElementType();
2448 auto NumElems = FixedVecTy->getNumElements();
2449 auto Align = DL.getPrefTypeAlign(ElemTy);
2450 InstructionCost LoadCost =
2451 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2452 InstructionCost StoreCost =
2453 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2454 return Opcode == Instruction::ExtractElement
2455 ? StoreCost * NumElems + LoadCost
2456 : (StoreCost + LoadCost) * NumElems + StoreCost;
2457 }
2458
2459 // For unsupported scalable vector.
2460 if (LT.second.isScalableVector() && !LT.first.isValid())
2461 return LT.first;
2462
2463 // Mask vector extract/insert is expanded via e8.
2464 if (Val->getScalarSizeInBits() == 1) {
2465 VectorType *WideTy =
2467 cast<VectorType>(Val)->getElementCount());
2468 if (Opcode == Instruction::ExtractElement) {
2469 InstructionCost ExtendCost
2470 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2472 InstructionCost ExtractCost
2473 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2474 return ExtendCost + ExtractCost;
2475 }
2476 InstructionCost ExtendCost
2477 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2479 InstructionCost InsertCost
2480 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2481 InstructionCost TruncCost
2482 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2484 return ExtendCost + InsertCost + TruncCost;
2485 }
2486
2487
2488 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2489 // and vslideup + vmv.s.x to insert element to vector.
2490 unsigned BaseCost = 1;
2491 // When insertelement we should add the index with 1 as the input of vslideup.
2492 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2493
2494 if (Index != -1U) {
2495 // The type may be split. For fixed-width vectors we can normalize the
2496 // index to the new type.
2497 if (LT.second.isFixedLengthVector()) {
2498 unsigned Width = LT.second.getVectorNumElements();
2499 Index = Index % Width;
2500 }
2501
2502 // If exact VLEN is known, we will insert/extract into the appropriate
2503 // subvector with no additional subvector insert/extract cost.
2504 if (auto VLEN = ST->getRealVLen()) {
2505 unsigned EltSize = LT.second.getScalarSizeInBits();
2506 unsigned M1Max = *VLEN / EltSize;
2507 Index = Index % M1Max;
2508 }
2509
2510 if (Index == 0)
2511 // We can extract/insert the first element without vslidedown/vslideup.
2512 SlideCost = 0;
2513 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2514 Val->getScalarType()->isIntegerTy())
2515 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2516 else if (Opcode == Instruction::InsertElement)
2517 SlideCost = 1; // With a constant index, we do not need to use addi.
2518 }
2519
2520 // When the vector needs to split into multiple register groups and the index
2521 // exceeds single vector register group, we need to insert/extract the element
2522 // via stack.
2523 if (LT.first > 1 &&
2524 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2525 LT.second.isScalableVector()))) {
2526 Type *ScalarType = Val->getScalarType();
2527 Align VecAlign = DL.getPrefTypeAlign(Val);
2528 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2529 // Extra addi for unknown index.
2530 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2531
2532 // Store all split vectors into stack and load the target element.
2533 if (Opcode == Instruction::ExtractElement)
2534 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2535 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2536 CostKind) +
2537 IdxCost;
2538
2539 // Store all split vectors into stack and store the target element and load
2540 // vectors back.
2541 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2542 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2543 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2544 CostKind) +
2545 IdxCost;
2546 }
2547
2548 // Extract i64 in the target that has XLEN=32 need more instruction.
2549 if (Val->getScalarType()->isIntegerTy() &&
2550 ST->getXLen() < Val->getScalarSizeInBits()) {
2551 // For extractelement, we need the following instructions:
2552 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2553 // vslidedown.vx v8, v8, a0
2554 // vmv.x.s a0, v8
2555 // li a1, 32
2556 // vsrl.vx v8, v8, a1
2557 // vmv.x.s a1, v8
2558
2559 // For insertelement, we need the following instructions:
2560 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2561 // vmv.v.i v12, 0
2562 // vslide1up.vx v16, v12, a1
2563 // vslide1up.vx v12, v16, a0
2564 // addi a0, a2, 1
2565 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2566 // vslideup.vx v8, v12, a2
2567
2568 // TODO: should we count these special vsetvlis?
2569 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2570 }
2571 return BaseCost + SlideCost;
2572}
2573
2577 unsigned Index) const {
2578 if (isa<FixedVectorType>(Val))
2580 Index);
2581
2582 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2583 // for the cost of extracting the last lane of a scalable vector. It probably
2584 // needs a more accurate cost.
2585 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2586 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2587 return getVectorInstrCost(Opcode, Val, CostKind,
2588 EC.getKnownMinValue() - 1 - Index, nullptr,
2589 nullptr);
2590}
2591
2593 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2595 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2596
2597 // TODO: Handle more cost kinds.
2599 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2600 Args, CxtI);
2601
2602 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2603 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2604 Args, CxtI);
2605
2606 // Skip if scalar size of Ty is bigger than ELEN.
2607 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2608 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2609 Args, CxtI);
2610
2611 // Legalize the type.
2612 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2613
2614 // TODO: Handle scalar type.
2615 if (!LT.second.isVector())
2616 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2617 Args, CxtI);
2618
2619 // f16 with zvfhmin and bf16 will be promoted to f32.
2620 // FIXME: nxv32[b]f16 will be custom lowered and split.
2621 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2622 InstructionCost CastCost = 0;
2623 if ((LT.second.getVectorElementType() == MVT::f16 ||
2624 LT.second.getVectorElementType() == MVT::bf16) &&
2625 TLI->getOperationAction(ISDOpcode, LT.second) ==
2627 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2628 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2629 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2630 // Add cost of extending arguments
2631 CastCost += LT.first * Args.size() *
2632 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2634 // Add cost of truncating result
2635 CastCost +=
2636 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2638 // Compute cost of op in promoted type
2639 LT.second = PromotedVT;
2640 }
2641
2642 auto getConstantMatCost =
2643 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2644 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2645 // Two sub-cases:
2646 // * Has a 5 bit immediate operand which can be splatted.
2647 // * Has a larger immediate which must be materialized in scalar register
2648 // We return 0 for both as we currently ignore the cost of materializing
2649 // scalar constants in GPRs.
2650 return 0;
2651
2652 return getConstantPoolLoadCost(Ty, CostKind);
2653 };
2654
2655 // Add the cost of materializing any constant vectors required.
2656 InstructionCost ConstantMatCost = 0;
2657 if (Op1Info.isConstant())
2658 ConstantMatCost += getConstantMatCost(0, Op1Info);
2659 if (Op2Info.isConstant())
2660 ConstantMatCost += getConstantMatCost(1, Op2Info);
2661
2662 unsigned Op;
2663 switch (ISDOpcode) {
2664 case ISD::ADD:
2665 case ISD::SUB:
2666 Op = RISCV::VADD_VV;
2667 break;
2668 case ISD::SHL:
2669 case ISD::SRL:
2670 case ISD::SRA:
2671 Op = RISCV::VSLL_VV;
2672 break;
2673 case ISD::AND:
2674 case ISD::OR:
2675 case ISD::XOR:
2676 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2677 break;
2678 case ISD::MUL:
2679 case ISD::MULHS:
2680 case ISD::MULHU:
2681 Op = RISCV::VMUL_VV;
2682 break;
2683 case ISD::SDIV:
2684 case ISD::UDIV:
2685 Op = RISCV::VDIV_VV;
2686 break;
2687 case ISD::SREM:
2688 case ISD::UREM:
2689 Op = RISCV::VREM_VV;
2690 break;
2691 case ISD::FADD:
2692 case ISD::FSUB:
2693 Op = RISCV::VFADD_VV;
2694 break;
2695 case ISD::FMUL:
2696 Op = RISCV::VFMUL_VV;
2697 break;
2698 case ISD::FDIV:
2699 Op = RISCV::VFDIV_VV;
2700 break;
2701 case ISD::FNEG:
2702 Op = RISCV::VFSGNJN_VV;
2703 break;
2704 default:
2705 // Assuming all other instructions have the same cost until a need arises to
2706 // differentiate them.
2707 return CastCost + ConstantMatCost +
2708 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2709 Args, CxtI);
2710 }
2711
2712 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2713 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2714 // ops are twice as expensive as integer ops. Do the same for vectors so
2715 // scalar floating point ops aren't cheaper than their vector equivalents.
2716 if (Ty->isFPOrFPVectorTy())
2717 InstrCost *= 2;
2718 return CastCost + ConstantMatCost + LT.first * InstrCost;
2719}
2720
2721// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2723 ArrayRef<const Value *> Ptrs, const Value *Base,
2724 const TTI::PointersChainInfo &Info, Type *AccessTy,
2727 // In the basic model we take into account GEP instructions only
2728 // (although here can come alloca instruction, a value, constants and/or
2729 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2730 // pointer). Typically, if Base is a not a GEP-instruction and all the
2731 // pointers are relative to the same base address, all the rest are
2732 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2733 // base, we just calculate cost of each non-Base GEP as an ADD operation if
2734 // any their index is a non-const.
2735 // If no known dependencies between the pointers cost is calculated as a sum
2736 // of costs of GEP instructions.
2737 for (auto [I, V] : enumerate(Ptrs)) {
2738 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2739 if (!GEP)
2740 continue;
2741 if (Info.isSameBase() && V != Base) {
2742 if (GEP->hasAllConstantIndices())
2743 continue;
2744 // If the chain is unit-stride and BaseReg + stride*i is a legal
2745 // addressing mode, then presume the base GEP is sitting around in a
2746 // register somewhere and check if we can fold the offset relative to
2747 // it.
2748 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2749 if (Info.isUnitStride() &&
2750 isLegalAddressingMode(AccessTy,
2751 /* BaseGV */ nullptr,
2752 /* BaseOffset */ Stride * I,
2753 /* HasBaseReg */ true,
2754 /* Scale */ 0,
2755 GEP->getType()->getPointerAddressSpace()))
2756 continue;
2757 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2758 {TTI::OK_AnyValue, TTI::OP_None},
2759 {TTI::OK_AnyValue, TTI::OP_None}, {});
2760 } else {
2761 SmallVector<const Value *> Indices(GEP->indices());
2762 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2763 Indices, AccessTy, CostKind);
2764 }
2765 }
2766 return Cost;
2767}
2768
2771 OptimizationRemarkEmitter *ORE) const {
2772 // TODO: More tuning on benchmarks and metrics with changes as needed
2773 // would apply to all settings below to enable performance.
2774
2775
2776 if (ST->enableDefaultUnroll())
2777 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2778
2779 // Enable Upper bound unrolling universally, not dependent upon the conditions
2780 // below.
2781 UP.UpperBound = true;
2782
2783 // Disable loop unrolling for Oz and Os.
2784 UP.OptSizeThreshold = 0;
2786 if (L->getHeader()->getParent()->hasOptSize())
2787 return;
2788
2789 SmallVector<BasicBlock *, 4> ExitingBlocks;
2790 L->getExitingBlocks(ExitingBlocks);
2791 LLVM_DEBUG(dbgs() << "Loop has:\n"
2792 << "Blocks: " << L->getNumBlocks() << "\n"
2793 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2794
2795 // Only allow another exit other than the latch. This acts as an early exit
2796 // as it mirrors the profitability calculation of the runtime unroller.
2797 if (ExitingBlocks.size() > 2)
2798 return;
2799
2800 // Limit the CFG of the loop body for targets with a branch predictor.
2801 // Allowing 4 blocks permits if-then-else diamonds in the body.
2802 if (L->getNumBlocks() > 4)
2803 return;
2804
2805 // Scan the loop: don't unroll loops with calls as this could prevent
2806 // inlining. Don't unroll auto-vectorized loops either, though do allow
2807 // unrolling of the scalar remainder.
2808 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2810 for (auto *BB : L->getBlocks()) {
2811 for (auto &I : *BB) {
2812 // Both auto-vectorized loops and the scalar remainder have the
2813 // isvectorized attribute, so differentiate between them by the presence
2814 // of vector instructions.
2815 if (IsVectorized && (I.getType()->isVectorTy() ||
2816 llvm::any_of(I.operand_values(), [](Value *V) {
2817 return V->getType()->isVectorTy();
2818 })))
2819 return;
2820
2821 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2822 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2823 if (!isLoweredToCall(F))
2824 continue;
2825 }
2826 return;
2827 }
2828
2829 SmallVector<const Value *> Operands(I.operand_values());
2830 Cost += getInstructionCost(&I, Operands,
2832 }
2833 }
2834
2835 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2836
2837 UP.Partial = true;
2838 UP.Runtime = true;
2839 UP.UnrollRemainder = true;
2840 UP.UnrollAndJam = true;
2841
2842 // Force unrolling small loops can be very useful because of the branch
2843 // taken cost of the backedge.
2844 if (Cost < 12)
2845 UP.Force = true;
2846}
2847
2852
2854 MemIntrinsicInfo &Info) const {
2855 const DataLayout &DL = getDataLayout();
2856 Intrinsic::ID IID = Inst->getIntrinsicID();
2857 LLVMContext &C = Inst->getContext();
2858 bool HasMask = false;
2859
2860 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2861 bool IsWrite) -> int64_t {
2862 if (auto *TarExtTy =
2863 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2864 return TarExtTy->getIntParameter(0);
2865
2866 return 1;
2867 };
2868
2869 switch (IID) {
2870 case Intrinsic::riscv_vle_mask:
2871 case Intrinsic::riscv_vse_mask:
2872 case Intrinsic::riscv_vlseg2_mask:
2873 case Intrinsic::riscv_vlseg3_mask:
2874 case Intrinsic::riscv_vlseg4_mask:
2875 case Intrinsic::riscv_vlseg5_mask:
2876 case Intrinsic::riscv_vlseg6_mask:
2877 case Intrinsic::riscv_vlseg7_mask:
2878 case Intrinsic::riscv_vlseg8_mask:
2879 case Intrinsic::riscv_vsseg2_mask:
2880 case Intrinsic::riscv_vsseg3_mask:
2881 case Intrinsic::riscv_vsseg4_mask:
2882 case Intrinsic::riscv_vsseg5_mask:
2883 case Intrinsic::riscv_vsseg6_mask:
2884 case Intrinsic::riscv_vsseg7_mask:
2885 case Intrinsic::riscv_vsseg8_mask:
2886 HasMask = true;
2887 [[fallthrough]];
2888 case Intrinsic::riscv_vle:
2889 case Intrinsic::riscv_vse:
2890 case Intrinsic::riscv_vlseg2:
2891 case Intrinsic::riscv_vlseg3:
2892 case Intrinsic::riscv_vlseg4:
2893 case Intrinsic::riscv_vlseg5:
2894 case Intrinsic::riscv_vlseg6:
2895 case Intrinsic::riscv_vlseg7:
2896 case Intrinsic::riscv_vlseg8:
2897 case Intrinsic::riscv_vsseg2:
2898 case Intrinsic::riscv_vsseg3:
2899 case Intrinsic::riscv_vsseg4:
2900 case Intrinsic::riscv_vsseg5:
2901 case Intrinsic::riscv_vsseg6:
2902 case Intrinsic::riscv_vsseg7:
2903 case Intrinsic::riscv_vsseg8: {
2904 // Intrinsic interface:
2905 // riscv_vle(merge, ptr, vl)
2906 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2907 // riscv_vse(val, ptr, vl)
2908 // riscv_vse_mask(val, ptr, mask, vl, policy)
2909 // riscv_vlseg#(merge, ptr, vl, sew)
2910 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
2911 // riscv_vsseg#(val, ptr, vl, sew)
2912 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
2913 bool IsWrite = Inst->getType()->isVoidTy();
2914 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2915 // The results of segment loads are TargetExtType.
2916 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2917 unsigned SEW =
2918 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2919 ->getZExtValue();
2920 Ty = TarExtTy->getTypeParameter(0U);
2922 IntegerType::get(C, SEW),
2923 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2924 }
2925 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2926 unsigned VLIndex = RVVIInfo->VLOperand;
2927 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2928 MaybeAlign Alignment =
2929 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2930 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2931 Value *Mask = ConstantInt::getTrue(MaskType);
2932 if (HasMask)
2933 Mask = Inst->getArgOperand(VLIndex - 1);
2934 Value *EVL = Inst->getArgOperand(VLIndex);
2935 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2936 // RVV uses contiguous elements as a segment.
2937 if (SegNum > 1) {
2938 unsigned ElemSize = Ty->getScalarSizeInBits();
2939 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2940 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2941 }
2942 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2943 Alignment, Mask, EVL);
2944 return true;
2945 }
2946 case Intrinsic::riscv_vlse_mask:
2947 case Intrinsic::riscv_vsse_mask:
2948 case Intrinsic::riscv_vlsseg2_mask:
2949 case Intrinsic::riscv_vlsseg3_mask:
2950 case Intrinsic::riscv_vlsseg4_mask:
2951 case Intrinsic::riscv_vlsseg5_mask:
2952 case Intrinsic::riscv_vlsseg6_mask:
2953 case Intrinsic::riscv_vlsseg7_mask:
2954 case Intrinsic::riscv_vlsseg8_mask:
2955 case Intrinsic::riscv_vssseg2_mask:
2956 case Intrinsic::riscv_vssseg3_mask:
2957 case Intrinsic::riscv_vssseg4_mask:
2958 case Intrinsic::riscv_vssseg5_mask:
2959 case Intrinsic::riscv_vssseg6_mask:
2960 case Intrinsic::riscv_vssseg7_mask:
2961 case Intrinsic::riscv_vssseg8_mask:
2962 HasMask = true;
2963 [[fallthrough]];
2964 case Intrinsic::riscv_vlse:
2965 case Intrinsic::riscv_vsse:
2966 case Intrinsic::riscv_vlsseg2:
2967 case Intrinsic::riscv_vlsseg3:
2968 case Intrinsic::riscv_vlsseg4:
2969 case Intrinsic::riscv_vlsseg5:
2970 case Intrinsic::riscv_vlsseg6:
2971 case Intrinsic::riscv_vlsseg7:
2972 case Intrinsic::riscv_vlsseg8:
2973 case Intrinsic::riscv_vssseg2:
2974 case Intrinsic::riscv_vssseg3:
2975 case Intrinsic::riscv_vssseg4:
2976 case Intrinsic::riscv_vssseg5:
2977 case Intrinsic::riscv_vssseg6:
2978 case Intrinsic::riscv_vssseg7:
2979 case Intrinsic::riscv_vssseg8: {
2980 // Intrinsic interface:
2981 // riscv_vlse(merge, ptr, stride, vl)
2982 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
2983 // riscv_vsse(val, ptr, stride, vl)
2984 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
2985 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
2986 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
2987 // riscv_vssseg#(val, ptr, offset, vl, sew)
2988 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
2989 bool IsWrite = Inst->getType()->isVoidTy();
2990 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2991 // The results of segment loads are TargetExtType.
2992 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2993 unsigned SEW =
2994 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2995 ->getZExtValue();
2996 Ty = TarExtTy->getTypeParameter(0U);
2998 IntegerType::get(C, SEW),
2999 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3000 }
3001 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3002 unsigned VLIndex = RVVIInfo->VLOperand;
3003 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3004 MaybeAlign Alignment =
3005 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3006
3007 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
3008 // Use the pointer alignment as the element alignment if the stride is a
3009 // multiple of the pointer alignment. Otherwise, the element alignment
3010 // should be the greatest common divisor of pointer alignment and stride.
3011 // For simplicity, just consider unalignment for elements.
3012 unsigned PointerAlign = Alignment.valueOrOne().value();
3013 if (!isa<ConstantInt>(Stride) ||
3014 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
3015 Alignment = Align(1);
3016
3017 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3018 Value *Mask = ConstantInt::getTrue(MaskType);
3019 if (HasMask)
3020 Mask = Inst->getArgOperand(VLIndex - 1);
3021 Value *EVL = Inst->getArgOperand(VLIndex);
3022 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3023 // RVV uses contiguous elements as a segment.
3024 if (SegNum > 1) {
3025 unsigned ElemSize = Ty->getScalarSizeInBits();
3026 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3027 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3028 }
3029 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3030 Alignment, Mask, EVL, Stride);
3031 return true;
3032 }
3033 case Intrinsic::riscv_vloxei_mask:
3034 case Intrinsic::riscv_vluxei_mask:
3035 case Intrinsic::riscv_vsoxei_mask:
3036 case Intrinsic::riscv_vsuxei_mask:
3037 case Intrinsic::riscv_vloxseg2_mask:
3038 case Intrinsic::riscv_vloxseg3_mask:
3039 case Intrinsic::riscv_vloxseg4_mask:
3040 case Intrinsic::riscv_vloxseg5_mask:
3041 case Intrinsic::riscv_vloxseg6_mask:
3042 case Intrinsic::riscv_vloxseg7_mask:
3043 case Intrinsic::riscv_vloxseg8_mask:
3044 case Intrinsic::riscv_vluxseg2_mask:
3045 case Intrinsic::riscv_vluxseg3_mask:
3046 case Intrinsic::riscv_vluxseg4_mask:
3047 case Intrinsic::riscv_vluxseg5_mask:
3048 case Intrinsic::riscv_vluxseg6_mask:
3049 case Intrinsic::riscv_vluxseg7_mask:
3050 case Intrinsic::riscv_vluxseg8_mask:
3051 case Intrinsic::riscv_vsoxseg2_mask:
3052 case Intrinsic::riscv_vsoxseg3_mask:
3053 case Intrinsic::riscv_vsoxseg4_mask:
3054 case Intrinsic::riscv_vsoxseg5_mask:
3055 case Intrinsic::riscv_vsoxseg6_mask:
3056 case Intrinsic::riscv_vsoxseg7_mask:
3057 case Intrinsic::riscv_vsoxseg8_mask:
3058 case Intrinsic::riscv_vsuxseg2_mask:
3059 case Intrinsic::riscv_vsuxseg3_mask:
3060 case Intrinsic::riscv_vsuxseg4_mask:
3061 case Intrinsic::riscv_vsuxseg5_mask:
3062 case Intrinsic::riscv_vsuxseg6_mask:
3063 case Intrinsic::riscv_vsuxseg7_mask:
3064 case Intrinsic::riscv_vsuxseg8_mask:
3065 HasMask = true;
3066 [[fallthrough]];
3067 case Intrinsic::riscv_vloxei:
3068 case Intrinsic::riscv_vluxei:
3069 case Intrinsic::riscv_vsoxei:
3070 case Intrinsic::riscv_vsuxei:
3071 case Intrinsic::riscv_vloxseg2:
3072 case Intrinsic::riscv_vloxseg3:
3073 case Intrinsic::riscv_vloxseg4:
3074 case Intrinsic::riscv_vloxseg5:
3075 case Intrinsic::riscv_vloxseg6:
3076 case Intrinsic::riscv_vloxseg7:
3077 case Intrinsic::riscv_vloxseg8:
3078 case Intrinsic::riscv_vluxseg2:
3079 case Intrinsic::riscv_vluxseg3:
3080 case Intrinsic::riscv_vluxseg4:
3081 case Intrinsic::riscv_vluxseg5:
3082 case Intrinsic::riscv_vluxseg6:
3083 case Intrinsic::riscv_vluxseg7:
3084 case Intrinsic::riscv_vluxseg8:
3085 case Intrinsic::riscv_vsoxseg2:
3086 case Intrinsic::riscv_vsoxseg3:
3087 case Intrinsic::riscv_vsoxseg4:
3088 case Intrinsic::riscv_vsoxseg5:
3089 case Intrinsic::riscv_vsoxseg6:
3090 case Intrinsic::riscv_vsoxseg7:
3091 case Intrinsic::riscv_vsoxseg8:
3092 case Intrinsic::riscv_vsuxseg2:
3093 case Intrinsic::riscv_vsuxseg3:
3094 case Intrinsic::riscv_vsuxseg4:
3095 case Intrinsic::riscv_vsuxseg5:
3096 case Intrinsic::riscv_vsuxseg6:
3097 case Intrinsic::riscv_vsuxseg7:
3098 case Intrinsic::riscv_vsuxseg8: {
3099 // Intrinsic interface (only listed ordered version):
3100 // riscv_vloxei(merge, ptr, index, vl)
3101 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3102 // riscv_vsoxei(val, ptr, index, vl)
3103 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3104 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3105 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3106 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3107 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3108 bool IsWrite = Inst->getType()->isVoidTy();
3109 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3110 // The results of segment loads are TargetExtType.
3111 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3112 unsigned SEW =
3113 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3114 ->getZExtValue();
3115 Ty = TarExtTy->getTypeParameter(0U);
3117 IntegerType::get(C, SEW),
3118 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3119 }
3120 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3121 unsigned VLIndex = RVVIInfo->VLOperand;
3122 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3123 Value *Mask;
3124 if (HasMask) {
3125 Mask = Inst->getArgOperand(VLIndex - 1);
3126 } else {
3127 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3128 // and casting that to scalar i64 triggers a vector/scalar mismatch
3129 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3130 // via extractelement instead.
3131 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3132 Mask = ConstantInt::getTrue(MaskType);
3133 }
3134 Value *EVL = Inst->getArgOperand(VLIndex);
3135 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3136 // RVV uses contiguous elements as a segment.
3137 if (SegNum > 1) {
3138 unsigned ElemSize = Ty->getScalarSizeInBits();
3139 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3140 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3141 }
3142 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3143 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3144 Align(1), Mask, EVL,
3145 /* Stride */ nullptr, OffsetOp);
3146 return true;
3147 }
3148 }
3149 return false;
3150}
3151
3153 if (Ty->isVectorTy()) {
3154 // f16 with only zvfhmin and bf16 will be promoted to f32
3155 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3156 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3157 EltTy->isBFloatTy())
3158 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3159 cast<VectorType>(Ty));
3160
3161 TypeSize Size = DL.getTypeSizeInBits(Ty);
3162 if (Size.isScalable() && ST->hasVInstructions())
3163 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3164
3165 if (ST->useRVVForFixedLengthVectors())
3166 return divideCeil(Size, ST->getRealMinVLen());
3167 }
3168
3169 return BaseT::getRegUsageForType(Ty);
3170}
3171
3172unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3173 if (SLPMaxVF.getNumOccurrences())
3174 return SLPMaxVF;
3175
3176 // Return how many elements can fit in getRegisterBitwidth. This is the
3177 // same routine as used in LoopVectorizer. We should probably be
3178 // accounting for whether we actually have instructions with the right
3179 // lane type, but we don't have enough information to do that without
3180 // some additional plumbing which hasn't been justified yet.
3181 TypeSize RegWidth =
3183 // If no vector registers, or absurd element widths, disable
3184 // vectorization by returning 1.
3185 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3186}
3187
3191
3193 return ST->enableUnalignedVectorMem();
3194}
3195
3198 ScalarEvolution *SE) const {
3199 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3200 return TTI::AMK_PostIndexed;
3201
3203}
3204
3206 const TargetTransformInfo::LSRCost &C2) const {
3207 // RISC-V specific here are "instruction number 1st priority".
3208 // If we need to emit adds inside the loop to add up base registers, then
3209 // we need at least one extra temporary register.
3210 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3211 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3212 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3213 C1.NumIVMuls, C1.NumBaseAdds,
3214 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3215 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3216 C2.NumIVMuls, C2.NumBaseAdds,
3217 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3218}
3219
3221 Align Alignment) const {
3222 auto *VTy = dyn_cast<VectorType>(DataTy);
3223 if (!VTy || VTy->isScalableTy())
3224 return false;
3225
3226 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3227 return false;
3228
3229 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3230 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3231 if (VTy->getElementType()->isIntegerTy(8))
3232 if (VTy->getElementCount().getFixedValue() > 256)
3233 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3234 ST->getMaxLMULForFixedLengthVectors();
3235 return true;
3236}
3237
3239 Align Alignment) const {
3240 auto *VTy = dyn_cast<VectorType>(DataTy);
3241 if (!VTy || VTy->isScalableTy())
3242 return false;
3243
3244 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3245 return false;
3246 return true;
3247}
3248
3249/// See if \p I should be considered for address type promotion. We check if \p
3250/// I is a sext with right type and used in memory accesses. If it used in a
3251/// "complex" getelementptr, we allow it to be promoted without finding other
3252/// sext instructions that sign extended the same initial value. A getelementptr
3253/// is considered as "complex" if it has more than 2 operands.
3255 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3256 bool Considerable = false;
3257 AllowPromotionWithoutCommonHeader = false;
3258 if (!isa<SExtInst>(&I))
3259 return false;
3260 Type *ConsideredSExtType =
3261 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3262 if (I.getType() != ConsideredSExtType)
3263 return false;
3264 // See if the sext is the one with the right type and used in at least one
3265 // GetElementPtrInst.
3266 for (const User *U : I.users()) {
3267 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3268 Considerable = true;
3269 // A getelementptr is considered as "complex" if it has more than 2
3270 // operands. We will promote a SExt used in such complex GEP as we
3271 // expect some computation to be merged if they are done on 64 bits.
3272 if (GEPInst->getNumOperands() > 2) {
3273 AllowPromotionWithoutCommonHeader = true;
3274 break;
3275 }
3276 }
3277 }
3278 return Considerable;
3279}
3280
3281bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3282 switch (Opcode) {
3283 case Instruction::Add:
3284 case Instruction::Sub:
3285 case Instruction::Mul:
3286 case Instruction::And:
3287 case Instruction::Or:
3288 case Instruction::Xor:
3289 case Instruction::FAdd:
3290 case Instruction::FSub:
3291 case Instruction::FMul:
3292 case Instruction::FDiv:
3293 case Instruction::ICmp:
3294 case Instruction::FCmp:
3295 return true;
3296 case Instruction::Shl:
3297 case Instruction::LShr:
3298 case Instruction::AShr:
3299 case Instruction::UDiv:
3300 case Instruction::SDiv:
3301 case Instruction::URem:
3302 case Instruction::SRem:
3303 case Instruction::Select:
3304 return Operand == 1;
3305 default:
3306 return false;
3307 }
3308}
3309
3311 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3312 return false;
3313
3314 if (canSplatOperand(I->getOpcode(), Operand))
3315 return true;
3316
3317 auto *II = dyn_cast<IntrinsicInst>(I);
3318 if (!II)
3319 return false;
3320
3321 switch (II->getIntrinsicID()) {
3322 case Intrinsic::fma:
3323 case Intrinsic::vp_fma:
3324 case Intrinsic::fmuladd:
3325 case Intrinsic::vp_fmuladd:
3326 return Operand == 0 || Operand == 1;
3327 case Intrinsic::vp_shl:
3328 case Intrinsic::vp_lshr:
3329 case Intrinsic::vp_ashr:
3330 case Intrinsic::vp_udiv:
3331 case Intrinsic::vp_sdiv:
3332 case Intrinsic::vp_urem:
3333 case Intrinsic::vp_srem:
3334 case Intrinsic::ssub_sat:
3335 case Intrinsic::vp_ssub_sat:
3336 case Intrinsic::usub_sat:
3337 case Intrinsic::vp_usub_sat:
3338 case Intrinsic::vp_select:
3339 return Operand == 1;
3340 // These intrinsics are commutative.
3341 case Intrinsic::vp_add:
3342 case Intrinsic::vp_mul:
3343 case Intrinsic::vp_and:
3344 case Intrinsic::vp_or:
3345 case Intrinsic::vp_xor:
3346 case Intrinsic::vp_fadd:
3347 case Intrinsic::vp_fmul:
3348 case Intrinsic::vp_icmp:
3349 case Intrinsic::vp_fcmp:
3350 case Intrinsic::smin:
3351 case Intrinsic::vp_smin:
3352 case Intrinsic::umin:
3353 case Intrinsic::vp_umin:
3354 case Intrinsic::smax:
3355 case Intrinsic::vp_smax:
3356 case Intrinsic::umax:
3357 case Intrinsic::vp_umax:
3358 case Intrinsic::sadd_sat:
3359 case Intrinsic::vp_sadd_sat:
3360 case Intrinsic::uadd_sat:
3361 case Intrinsic::vp_uadd_sat:
3362 // These intrinsics have 'vr' versions.
3363 case Intrinsic::vp_sub:
3364 case Intrinsic::vp_fsub:
3365 case Intrinsic::vp_fdiv:
3366 return Operand == 0 || Operand == 1;
3367 default:
3368 return false;
3369 }
3370}
3371
3372/// Check if sinking \p I's operands to I's basic block is profitable, because
3373/// the operands can be folded into a target instruction, e.g.
3374/// splats of scalars can fold into vector instructions.
3377 using namespace llvm::PatternMatch;
3378
3379 if (I->isBitwiseLogicOp()) {
3380 if (!I->getType()->isVectorTy()) {
3381 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3382 for (auto &Op : I->operands()) {
3383 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3384 if (match(Op.get(), m_Not(m_Value()))) {
3385 Ops.push_back(&Op);
3386 return true;
3387 }
3388 }
3389 }
3390 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3391 for (auto &Op : I->operands()) {
3392 // (and X, (not Y)) -> (vandn.vv X, Y)
3393 if (match(Op.get(), m_Not(m_Value()))) {
3394 Ops.push_back(&Op);
3395 return true;
3396 }
3397 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3399 m_ZeroInt()),
3400 m_Value(), m_ZeroMask()))) {
3401 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3402 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3403 Ops.push_back(&Not);
3404 Ops.push_back(&InsertElt);
3405 Ops.push_back(&Op);
3406 return true;
3407 }
3408 }
3409 }
3410 }
3411
3412 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3413 return false;
3414
3415 // Don't sink splat operands if the target prefers it. Some targets requires
3416 // S2V transfer buffers and we can run out of them copying the same value
3417 // repeatedly.
3418 // FIXME: It could still be worth doing if it would improve vector register
3419 // pressure and prevent a vector spill.
3420 if (!ST->sinkSplatOperands())
3421 return false;
3422
3423 for (auto OpIdx : enumerate(I->operands())) {
3424 if (!canSplatOperand(I, OpIdx.index()))
3425 continue;
3426
3427 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3428 // Make sure we are not already sinking this operand
3429 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3430 continue;
3431
3432 // We are looking for a splat that can be sunk.
3434 m_Value(), m_ZeroMask())))
3435 continue;
3436
3437 // Don't sink i1 splats.
3438 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3439 continue;
3440
3441 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3442 // and vector registers
3443 for (Use &U : Op->uses()) {
3444 Instruction *Insn = cast<Instruction>(U.getUser());
3445 if (!canSplatOperand(Insn, U.getOperandNo()))
3446 return false;
3447 }
3448
3449 // Sink any fpexts since they might be used in a widening fp pattern.
3450 Use *InsertEltUse = &Op->getOperandUse(0);
3451 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3452 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3453 Ops.push_back(&InsertElt->getOperandUse(1));
3454 Ops.push_back(InsertEltUse);
3455 Ops.push_back(&OpIdx.value());
3456 }
3457 return true;
3458}
3459
3461RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3463 // TODO: Enable expansion when unaligned access is not supported after we fix
3464 // issues in ExpandMemcmp.
3465 if (!ST->enableUnalignedScalarMem())
3466 return Options;
3467
3468 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3469 return Options;
3470
3471 Options.AllowOverlappingLoads = true;
3472 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3473 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3474 if (ST->is64Bit()) {
3475 Options.LoadSizes = {8, 4, 2, 1};
3476 Options.AllowedTailExpansions = {3, 5, 6};
3477 } else {
3478 Options.LoadSizes = {4, 2, 1};
3479 Options.AllowedTailExpansions = {3};
3480 }
3481
3482 if (IsZeroCmp && ST->hasVInstructions()) {
3483 unsigned VLenB = ST->getRealMinVLen() / 8;
3484 // The minimum size should be `XLen / 8 + 1`, and the maxinum size should be
3485 // `VLenB * MaxLMUL` so that it fits in a single register group.
3486 unsigned MinSize = ST->getXLen() / 8 + 1;
3487 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3488 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3489 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3490 }
3491 return Options;
3492}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static bool shouldSplit(Instruction *InsertPoint, DenseSet< Value * > &PrevConditionValues, DenseSet< Value * > &ConditionValues, DominatorTree &DT, DenseSet< Instruction * > &Unhoistables)
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static InstructionCost costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, std::optional< unsigned > VLen, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Try to perform better estimation of the permutation.
static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Attempt to approximate the cost of a shuffle which will require splitting during legalization.
static bool isRepeatedConcatMask(ArrayRef< int > Mask, int &SubVectorSize)
static unsigned isM1OrSmaller(MVT VT)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > RVVMinTripCount("riscv-v-min-trip-count", cl::desc("Set the lower bound of a trip count to decide on " "vectorization while tail-folding."), cl::init(5), cl::Hidden)
static InstructionCost getIntImmCostImpl(const DataLayout &DL, const RISCVSubtarget *ST, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, bool FreeZeroes)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm)
This file defines a TargetTransformInfoImplBase conforming object specific to the RISC-V target machi...
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
std::optional< unsigned > getMaxVScale() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
bool isLegalAddImmediate(int64_t imm) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:693
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:678
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool noNaNs() const
Definition FMF.h:65
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
const Instruction * getInst() const
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
InstructionCost getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getExpandCompressMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector regis...
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVVType::VLMUL getLMUL(MVT VT)
This class represents an analyzed expression in the program.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
virtual const DataLayout & getDataLayout() const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:233
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:963
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1106
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition TypeSize.h:180
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:874
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:412
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:838
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:698
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:759
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:844
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:972
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:920
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:733
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:953
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:850
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
bool match(Val *V, const Pattern &P)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2530
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1883
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130
Information about a load/store intrinsic defined by the target.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).