1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
47static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
48 cl::init(true), cl::Hidden);
49
50InstructionCost
51RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
52 TTI::TargetCostKind CostKind) const {
53 // Check if the type is valid for all CostKind
54 if (!VT.isVector())
55 return InstructionCost::getInvalid();
56 size_t NumInstr = OpCodes.size();
57 if (CostKind == TTI::TCK_CodeSize)
58 return NumInstr;
59 InstructionCost LMULCost = TLI->getLMULCost(VT);
60 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
61 return LMULCost * NumInstr;
62 InstructionCost Cost = 0;
63 for (auto Op : OpCodes) {
64 switch (Op) {
65 case RISCV::VRGATHER_VI:
66 Cost += TLI->getVRGatherVICost(VT);
67 break;
68 case RISCV::VRGATHER_VV:
69 Cost += TLI->getVRGatherVVCost(VT);
70 break;
71 case RISCV::VSLIDEUP_VI:
72 case RISCV::VSLIDEDOWN_VI:
73 Cost += TLI->getVSlideVICost(VT);
74 break;
75 case RISCV::VSLIDEUP_VX:
76 case RISCV::VSLIDEDOWN_VX:
77 Cost += TLI->getVSlideVXCost(VT);
78 break;
79 case RISCV::VREDMAX_VS:
80 case RISCV::VREDMIN_VS:
81 case RISCV::VREDMAXU_VS:
82 case RISCV::VREDMINU_VS:
83 case RISCV::VREDSUM_VS:
84 case RISCV::VREDAND_VS:
85 case RISCV::VREDOR_VS:
86 case RISCV::VREDXOR_VS:
87 case RISCV::VFREDMAX_VS:
88 case RISCV::VFREDMIN_VS:
89 case RISCV::VFREDUSUM_VS: {
90 unsigned VL = VT.getVectorMinNumElements();
91 if (!VT.isFixedLengthVector())
92 VL *= *getVScaleForTuning();
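// Cost the single reduction instruction as roughly log2(VL), modeling the
// pairwise reduction tree a core performs internally.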
93 Cost += Log2_32_Ceil(VL);
94 break;
95 }
96 case RISCV::VFREDOSUM_VS: {
97 unsigned VL = VT.getVectorMinNumElements();
98 if (!VT.isFixedLengthVector())
99 VL *= *getVScaleForTuning();
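// The ordered FP reduction has to accumulate elements sequentially, so its
// cost scales linearly with VL.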
100 Cost += VL;
101 break;
102 }
103 case RISCV::VMV_X_S:
104 case RISCV::VMV_S_X:
105 case RISCV::VFMV_F_S:
106 case RISCV::VFMV_S_F:
107 case RISCV::VMOR_MM:
108 case RISCV::VMXOR_MM:
109 case RISCV::VMAND_MM:
110 case RISCV::VMANDN_MM:
111 case RISCV::VMNAND_MM:
112 case RISCV::VCPOP_M:
113 case RISCV::VFIRST_M:
114 Cost += 1;
115 break;
116 case RISCV::VDIV_VV:
117 case RISCV::VREM_VV:
118 Cost += LMULCost * TTI::TCC_Expensive;
119 break;
120 default:
121 Cost += LMULCost;
122 }
123 }
124 return Cost;
125}
126
128 const RISCVSubtarget *ST,
129 const APInt &Imm, Type *Ty,
131 bool FreeZeroes) {
132 assert(Ty->isIntegerTy() &&
133 "getIntImmCost can only estimate cost of materialising integers");
134
135 // We have a Zero register, so 0 is always free.
136 if (Imm == 0)
137 return TTI::TCC_Free;
138
139 // Otherwise, we check how many instructions it will take to materialise.
140 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
141 /*CompressionCost=*/false, FreeZeroes);
142}
143
147 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
148}
149
150// Look for patterns of shift followed by AND that can be turned into a pair of
151// shifts. We won't need to materialize an immediate for the AND so these can
152// be considered free.
153static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
154 uint64_t Mask = Imm.getZExtValue();
155 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
156 if (!BO || !BO->hasOneUse())
157 return false;
158
159 if (BO->getOpcode() != Instruction::Shl)
160 return false;
161
162 if (!isa<ConstantInt>(BO->getOperand(1)))
163 return false;
164
165 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
166 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
167 // is a mask shifted by c2 bits with c3 leading zeros.
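// For example, Mask = 0x3f8 (0b11'1111'1000) with ShAmt = 3: the mask's
// trailing-zero count equals the shift amount, so no AND immediate is needed.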
168 if (isShiftedMask_64(Mask)) {
169 unsigned Trailing = llvm::countr_zero(Mask);
170 if (ShAmt == Trailing)
171 return true;
172 }
173
174 return false;
175}
176
177// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
178// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
179// the type will be split so only the lower 32 bits need to be compared using
180// (srai/srli X, C) == C2.
181static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
182 if (!Inst->hasOneUse())
183 return false;
184
185 // Look for equality comparison.
186 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
187 if (!Cmp || !Cmp->isEquality())
188 return false;
189
190 // Right hand side of comparison should be a constant.
191 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
192 if (!C)
193 return false;
194
195 uint64_t Mask = Imm.getZExtValue();
196
197 // Mask should be of the form -(1 << C) in the lower 32 bits.
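// For example, 0xffffff00 is -(1 << 8) truncated to 32 bits.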
198 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
199 return false;
200
201 // Comparison constant should be a subset of Mask.
202 uint64_t CmpC = C->getZExtValue();
203 if ((CmpC & Mask) != CmpC)
204 return false;
205
206 // We'll need to sign extend the comparison constant and shift it right. Make
207 // sure the new constant can use addi/xori+seqz/snez.
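// (addi covers constants in [-2047, 2048] via subtraction and xori covers
// [-2048, 2047], so together they span [-2048, 2048].)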
208 unsigned ShiftBits = llvm::countr_zero(Mask);
209 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
210 return NewCmpC >= -2048 && NewCmpC <= 2048;
211}
212
214 const APInt &Imm, Type *Ty,
216 Instruction *Inst) const {
217 assert(Ty->isIntegerTy() &&
218 "getIntImmCost can only estimate cost of materialising integers");
219
220 // We have a Zero register, so 0 is always free.
221 if (Imm == 0)
222 return TTI::TCC_Free;
223
224 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
225 // commutative, in others the immediate comes from a specific argument index.
226 bool Takes12BitImm = false;
227 unsigned ImmArgIdx = ~0U;
228
229 switch (Opcode) {
230 case Instruction::GetElementPtr:
231 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
232 // split up large offsets in GEP into better parts than ConstantHoisting
233 // can.
234 return TTI::TCC_Free;
235 case Instruction::Store: {
236 // Use the materialization cost regardless of whether it's the address or the
237 // value that is constant, except when the store is misaligned and
238 // misaligned accesses are not legal (experience shows constant hoisting
239 // can sometimes be harmful in such cases).
240 if (Idx == 1 || !Inst)
241 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
242 /*FreeZeroes=*/true);
243
244 StoreInst *ST = cast<StoreInst>(Inst);
245 if (!getTLI()->allowsMemoryAccessForAlignment(
246 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
247 ST->getPointerAddressSpace(), ST->getAlign()))
248 return TTI::TCC_Free;
249
250 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
251 /*FreeZeroes=*/true);
252 }
253 case Instruction::Load:
254 // If the address is a constant, use the materialization cost.
255 return getIntImmCost(Imm, Ty, CostKind);
256 case Instruction::And:
257 // zext.h
258 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
259 return TTI::TCC_Free;
260 // zext.w
261 if (Imm == UINT64_C(0xffffffff) &&
262 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
263 return TTI::TCC_Free;
264 // bclri
265 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
266 return TTI::TCC_Free;
267 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
268 canUseShiftPair(Inst, Imm))
269 return TTI::TCC_Free;
270 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
271 canUseShiftCmp(Inst, Imm))
272 return TTI::TCC_Free;
273 Takes12BitImm = true;
274 break;
275 case Instruction::Add:
276 Takes12BitImm = true;
277 break;
278 case Instruction::Or:
279 case Instruction::Xor:
280 // bseti/binvi
281 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
282 return TTI::TCC_Free;
283 Takes12BitImm = true;
284 break;
285 case Instruction::Mul:
286 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
287 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
288 return TTI::TCC_Free;
289 // One more or less than a power of 2 can use SLLI+ADD/SUB.
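// e.g. x * 7 becomes (x << 3) - x and x * 9 becomes (x << 3) + x.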
290 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
291 return TTI::TCC_Free;
292 // FIXME: There is no MULI instruction.
293 Takes12BitImm = true;
294 break;
295 case Instruction::Sub:
296 case Instruction::Shl:
297 case Instruction::LShr:
298 case Instruction::AShr:
299 Takes12BitImm = true;
300 ImmArgIdx = 1;
301 break;
302 default:
303 break;
304 }
305
306 if (Takes12BitImm) {
307 // Check immediate is the correct argument...
308 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
309 // ... and fits into the 12-bit immediate.
310 if (Imm.getSignificantBits() <= 64 &&
311 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
312 return TTI::TCC_Free;
313 }
314 }
315
316 // Otherwise, use the full materialisation cost.
317 return getIntImmCost(Imm, Ty, CostKind);
318 }
319
320 // By default, prevent hoisting.
321 return TTI::TCC_Free;
322}
323
326 const APInt &Imm, Type *Ty,
328 // Prevent hoisting in unknown cases.
329 return TTI::TCC_Free;
330}
331
333 return ST->hasVInstructions();
334}
335
337RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
338 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
339 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
340}
341
343 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
345 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
346 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
347 if (Opcode == Instruction::FAdd)
349
350 // zve32x is broken for partial_reduce_umla, but let's make sure we
351 // don't generate them.
352 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
353 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
354 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
355 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
357
358 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
359 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
360 // Note: Assuming all vqdot* variants have equal cost
361 return LT.first *
362 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
363}
364
366 // Currently, the ExpandReductions pass can't expand scalable-vector
367 // reductions, but we still request expansion as RVV doesn't support certain
368 // reductions and the SelectionDAG can't legalize them either.
369 switch (II->getIntrinsicID()) {
370 default:
371 return false;
372 // These reductions have no equivalent in RVV
373 case Intrinsic::vector_reduce_mul:
374 case Intrinsic::vector_reduce_fmul:
375 return true;
376 }
377}
378
379std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
380 if (ST->hasVInstructions())
381 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
382 return BaseT::getMaxVScale();
383}
384
385std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
386 if (ST->hasVInstructions())
387 if (unsigned MinVLen = ST->getRealMinVLen();
388 MinVLen >= RISCV::RVVBitsPerBlock)
389 return MinVLen / RISCV::RVVBitsPerBlock;
390 return BaseT::getVScaleForTuning();
391}
392
395 unsigned LMUL =
396 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
397 switch (K) {
399 return TypeSize::getFixed(ST->getXLen());
401 return TypeSize::getFixed(
402 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
405 (ST->hasVInstructions() &&
406 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
408 : 0);
409 }
410
411 llvm_unreachable("Unsupported register kind");
412}
413
414InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
415 const TTI::TargetCostKind CostKind) const {
416 switch (CostKind) {
419 // Always 2 instructions
420 return 2;
421 case TTI::TCK_Latency:
423 // Depending on the memory model the address generation will
424 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
425 // have a way of getting this information here, so conservatively
426 // require both.
427 // In practice, these are generally implemented together.
428 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
429 }
430 llvm_unreachable("Unsupported cost kind");
431}
432
434RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
436 // Add a cost of address generation + the cost of the load. The address
437 // is expected to be a PC relative offset to a constant pool entry
438 // using auipc/addi.
439 return getStaticDataAddrGenerationCost(CostKind) +
440 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
441 /*AddressSpace=*/0, CostKind);
442}
443
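// Returns true if Mask is a repeated concatenation of its leading
// SubVectorSize elements, e.g. <0, 1, 0, 1, 0, 1, 0, 1> repeats a 2-element
// subvector four times.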
444static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
445 unsigned Size = Mask.size();
446 if (!isPowerOf2_32(Size))
447 return false;
448 for (unsigned I = 0; I != Size; ++I) {
449 if (static_cast<unsigned>(Mask[I]) == I)
450 continue;
451 if (Mask[I] != 0)
452 return false;
453 if (Size % I != 0)
454 return false;
455 for (unsigned J = I + 1; J != Size; ++J)
456 // Check the pattern is repeated.
457 if (static_cast<unsigned>(Mask[J]) != J % I)
458 return false;
459 SubVectorSize = I;
460 return true;
461 }
462 // Reaching here means Mask is the identity mask (e.g. <0, 1, 2, 3>), not a concatenation.
463 return false;
464}
465
467 LLVMContext &C) {
468 assert((DataVT.getScalarSizeInBits() != 8 ||
469 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
470 MVT IndexVT = DataVT.changeTypeToInteger();
471 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
472 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
473 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
474}
475
476/// Attempt to approximate the cost of a shuffle which will require splitting
477/// during legalization. Note that processShuffleMasks is not an exact proxy
478/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
479/// reasonably close upperbound.
481 MVT LegalVT, VectorType *Tp,
482 ArrayRef<int> Mask,
484 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
485 "Expected fixed vector type and non-empty mask");
486 unsigned LegalNumElts = LegalVT.getVectorNumElements();
487 // Number of destination vectors after legalization:
488 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
489 // We are going to permute multiple sources and the result will be in
490 // multiple destinations. Providing an accurate cost only for splits where
491 // the element type remains the same.
492 if (NumOfDests <= 1 ||
494 Tp->getElementType()->getPrimitiveSizeInBits() ||
495 LegalNumElts >= Tp->getElementCount().getFixedValue())
497
498 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
499 unsigned LegalVTSize = LegalVT.getStoreSize();
500 // Number of source vectors after legalization:
501 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
502
503 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
504
505 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
506 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
507 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
508 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
509 assert(NormalizedVF >= Mask.size() &&
510 "Normalized mask expected to be not shorter than original mask.");
511 copy(Mask, NormalizedMask.begin());
512 InstructionCost Cost = 0;
513 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
515 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
516 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
517 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
518 return;
519 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
520 .second)
521 return;
522 Cost += TTI.getShuffleCost(
524 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
525 SingleOpTy, RegMask, CostKind, 0, nullptr);
526 },
527 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
528 Cost += TTI.getShuffleCost(
530 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
531 SingleOpTy, RegMask, CostKind, 0, nullptr);
532 });
533 return Cost;
534}
535
536/// Try to perform better estimation of the permutation.
537/// 1. Split the source/destination vectors into real registers.
538/// 2. Do the mask analysis to identify which real registers are
539/// permuted. If more than 1 source registers are used for the
540/// destination register building, the cost for this destination register
541/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
542/// source register is used, build mask and calculate the cost as a cost
543/// of PermuteSingleSrc.
544/// Also, for the single register permute we try to identify if the
545/// destination register is just a copy of the source register or the
546/// copy of the previous destination register (the cost is
547/// TTI::TCC_Basic). If the source register is just reused, the cost for
548/// this operation is 0.
549static InstructionCost
551 std::optional<unsigned> VLen, VectorType *Tp,
553 assert(LegalVT.isFixedLengthVector());
554 if (!VLen || Mask.empty())
556 MVT ElemVT = LegalVT.getVectorElementType();
557 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
558 LegalVT = TTI.getTypeLegalizationCost(
559 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
560 .second;
561 // Number of destination vectors after legalization:
562 InstructionCost NumOfDests =
563 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
564 if (NumOfDests <= 1 ||
566 Tp->getElementType()->getPrimitiveSizeInBits() ||
567 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
569
570 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
571 unsigned LegalVTSize = LegalVT.getStoreSize();
572 // Number of source vectors after legalization:
573 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
574
575 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
576 LegalVT.getVectorNumElements());
577
578 unsigned E = NumOfDests.getValue();
579 unsigned NormalizedVF =
580 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
581 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
582 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
583 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
584 assert(NormalizedVF >= Mask.size() &&
585 "Normalized mask expected to be not shorter than original mask.");
586 copy(Mask, NormalizedMask.begin());
587 InstructionCost Cost = 0;
588 int NumShuffles = 0;
589 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
591 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
592 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
593 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
594 return;
595 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
596 .second)
597 return;
598 ++NumShuffles;
599 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
600 SingleOpTy, RegMask, CostKind, 0, nullptr);
601 },
602 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
603 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
604 SingleOpTy, RegMask, CostKind, 0, nullptr);
605 NumShuffles += 2;
606 });
607 // Note: check that we do not emit too many shuffles here to prevent code
608 // size explosion.
609 // TODO: Investigate whether it can be improved by extra analysis of the masks
610 // to check if the code is more profitable.
611 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
612 (NumOfDestRegs <= 2 && NumShuffles < 4))
613 return Cost;
615}
616
617InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
618 ArrayRef<int> Mask,
620 // Avoid missing masks and length changing shuffles
621 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
623
624 int NumElts = Tp->getNumElements();
625 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
626 // Avoid scalarization cases
627 if (!LT.second.isFixedLengthVector())
629
630 // Requires moving elements between parts, which requires additional
631 // unmodeled instructions.
632 if (LT.first != 1)
634
635 auto GetSlideOpcode = [&](int SlideAmt) {
636 assert(SlideAmt != 0);
637 bool IsVI = isUInt<5>(std::abs(SlideAmt));
638 if (SlideAmt < 0)
639 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
640 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
641 };
642
643 std::array<std::pair<int, int>, 2> SrcInfo;
644 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
646
647 if (SrcInfo[1].second == 0)
648 std::swap(SrcInfo[0], SrcInfo[1]);
649
650 InstructionCost FirstSlideCost = 0;
651 if (SrcInfo[0].second != 0) {
652 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
653 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
654 }
655
656 if (SrcInfo[1].first == -1)
657 return FirstSlideCost;
658
659 InstructionCost SecondSlideCost = 0;
660 if (SrcInfo[1].second != 0) {
661 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
662 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
663 } else {
664 SecondSlideCost =
665 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
666 }
667
668 auto EC = Tp->getElementCount();
669 VectorType *MaskTy =
671 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
672 return FirstSlideCost + SecondSlideCost + MaskCost;
673}
674
677 VectorType *SrcTy, ArrayRef<int> Mask,
678 TTI::TargetCostKind CostKind, int Index,
680 const Instruction *CxtI) const {
681 assert((Mask.empty() || DstTy->isScalableTy() ||
682 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
683 "Expected the Mask to match the return size if given");
684 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
685 "Expected the same scalar types");
686
687 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
688 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
689
690 // First, handle cases where having a fixed length vector enables us to
691 // give a more accurate cost than falling back to generic scalable codegen.
692 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
693 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
694 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
696 *this, LT.second, ST->getRealVLen(),
697 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
698 if (VRegSplittingCost.isValid())
699 return VRegSplittingCost;
700 switch (Kind) {
701 default:
702 break;
704 if (Mask.size() >= 2) {
705 MVT EltTp = LT.second.getVectorElementType();
706 // If the size of the element is < ELEN then shuffles of interleaves and
707 // deinterleaves of 2 vectors can be lowered into the following
708 // sequences
709 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
710 // Example sequence:
711 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
712 // vwaddu.vv v10, v8, v9
713 // li a0, -1 (ignored)
714 // vwmaccu.vx v10, a0, v9
715 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
716 return 2 * LT.first * TLI->getLMULCost(LT.second);
717
718 if (Mask[0] == 0 || Mask[0] == 1) {
719 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
720 // Example sequence:
721 // vnsrl.wi v10, v8, 0
722 if (equal(DeinterleaveMask, Mask))
723 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
724 LT.second, CostKind);
725 }
726 }
727 int SubVectorSize;
728 if (LT.second.getScalarSizeInBits() != 1 &&
729 isRepeatedConcatMask(Mask, SubVectorSize)) {
731 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
732 // The cost of extraction from a subvector is 0 if the index is 0.
733 for (unsigned I = 0; I != NumSlides; ++I) {
734 unsigned InsertIndex = SubVectorSize * (1 << I);
735 FixedVectorType *SubTp =
736 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
737 FixedVectorType *DestTp =
739 std::pair<InstructionCost, MVT> DestLT =
741 // Add the cost of whole vector register move because the
742 // destination vector register group for vslideup cannot overlap the
743 // source.
744 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
745 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
746 CostKind, InsertIndex, SubTp);
747 }
748 return Cost;
749 }
750 }
751
752 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
753 SlideCost.isValid())
754 return SlideCost;
755
756 // vrgather + cost of generating the mask constant.
757 // We model this for an unknown mask with a single vrgather.
758 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
759 LT.second.getVectorNumElements() <= 256)) {
760 VectorType *IdxTy =
761 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
762 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
763 return IndexCost +
764 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
765 }
766 break;
767 }
770
771 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
772 SlideCost.isValid())
773 return SlideCost;
774
775 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
776 // register for the second vrgather. We model this for an unknown
777 // (shuffle) mask.
778 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
779 LT.second.getVectorNumElements() <= 256)) {
780 auto &C = SrcTy->getContext();
781 auto EC = SrcTy->getElementCount();
782 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
784 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
785 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
786 return 2 * IndexCost +
787 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
788 LT.second, CostKind) +
789 MaskCost;
790 }
791 break;
792 }
793 }
794
795 auto shouldSplit = [](TTI::ShuffleKind Kind) {
796 switch (Kind) {
797 default:
798 return false;
802 return true;
803 }
804 };
805
806 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
807 shouldSplit(Kind)) {
808 InstructionCost SplitCost =
809 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
810 if (SplitCost.isValid())
811 return SplitCost;
812 }
813 }
814
815 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
816 switch (Kind) {
817 default:
818 // Fallthrough to generic handling.
819 // TODO: Most of these cases will return getInvalid in generic code, and
820 // must be implemented here.
821 break;
823 // Extract at zero is always a subregister extract
824 if (Index == 0)
825 return TTI::TCC_Free;
826
827 // If we're extracting a subvector of at most m1 size at a sub-register
828 // boundary - which unfortunately we need exact vlen to identify - this is
829 // a subregister extract at worst and thus won't require a vslidedown.
830 // TODO: Extend for aligned m2, m4 subvector extracts
831 // TODO: Extend for misaligned (but contained) extracts
832 // TODO: Extend for scalable subvector types
833 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
834 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
835 if (std::optional<unsigned> VLen = ST->getRealVLen();
836 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
837 SubLT.second.getSizeInBits() <= *VLen)
838 return TTI::TCC_Free;
839 }
840
841 // Example sequence:
842 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
843 // vslidedown.vi v8, v9, 2
844 return LT.first *
845 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
847 // Example sequence:
848 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
849 // vslideup.vi v8, v9, 2
850 LT = getTypeLegalizationCost(DstTy);
851 return LT.first *
852 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
853 case TTI::SK_Select: {
854 // Example sequence:
855 // li a0, 90
856 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
857 // vmv.s.x v0, a0
858 // vmerge.vvm v8, v9, v8, v0
859 // We use 2 for the cost of the mask materialization as this is the true
860 // cost for small masks and most shuffles are small. At worst, this cost
861 // should be a very small constant for the constant pool load. As such,
862 // we may bias towards large selects slightly more than truly warranted.
863 return LT.first *
864 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
865 LT.second, CostKind));
866 }
867 case TTI::SK_Broadcast: {
868 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
869 Instruction::InsertElement);
870 if (LT.second.getScalarSizeInBits() == 1) {
871 if (HasScalar) {
872 // Example sequence:
873 // andi a0, a0, 1
874 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
875 // vmv.v.x v8, a0
876 // vmsne.vi v0, v8, 0
877 return LT.first *
878 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
879 LT.second, CostKind));
880 }
881 // Example sequence:
882 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
883 // vmv.v.i v8, 0
884 // vmerge.vim v8, v8, 1, v0
885 // vmv.x.s a0, v8
886 // andi a0, a0, 1
887 // vmv.v.x v8, a0
888 // vmsne.vi v0, v8, 0
889
890 return LT.first *
891 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
892 RISCV::VMV_X_S, RISCV::VMV_V_X,
893 RISCV::VMSNE_VI},
894 LT.second, CostKind));
895 }
896
897 if (HasScalar) {
898 // Example sequence:
899 // vmv.v.x v8, a0
900 return LT.first *
901 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
902 }
903
904 // Example sequence:
905 // vrgather.vi v9, v8, 0
906 return LT.first *
907 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
908 }
909 case TTI::SK_Splice: {
910 // vslidedown+vslideup.
911 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
912 // of similar code, but I think we expand through memory.
913 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
914 if (Index >= 0 && Index < 32)
915 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
916 else if (Index < 0 && Index > -32)
917 Opcodes[1] = RISCV::VSLIDEUP_VI;
918 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
919 }
920 case TTI::SK_Reverse: {
921
922 if (!LT.second.isVector())
924
925 // TODO: Cases to improve here:
926 // * Illegal vector types
927 // * i64 on RV32
928 if (SrcTy->getElementType()->isIntegerTy(1)) {
929 VectorType *WideTy =
930 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
931 cast<VectorType>(SrcTy)->getElementCount());
932 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
934 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
935 nullptr) +
936 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
938 }
939
940 MVT ContainerVT = LT.second;
941 if (LT.second.isFixedLengthVector())
942 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
943 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
944 if (ContainerVT.bitsLE(M1VT)) {
945 // Example sequence:
946 // csrr a0, vlenb
947 // srli a0, a0, 3
948 // addi a0, a0, -1
949 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
950 // vid.v v9
951 // vrsub.vx v10, v9, a0
952 // vrgather.vv v9, v8, v10
953 InstructionCost LenCost = 3;
954 if (LT.second.isFixedLengthVector())
955 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
956 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
957 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
958 if (LT.second.isFixedLengthVector() &&
959 isInt<5>(LT.second.getVectorNumElements() - 1))
960 Opcodes[1] = RISCV::VRSUB_VI;
961 InstructionCost GatherCost =
962 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
963 return LT.first * (LenCost + GatherCost);
964 }
965
966 // At high LMUL, we split into a series of M1 reverses (see
967 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
968 // the resulting gap at the bottom (for fixed vectors only). The important
969 // bit is that the cost scales linearly, not quadratically with LMUL.
970 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
971 InstructionCost FixedCost =
972 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
973 unsigned Ratio =
975 InstructionCost GatherCost =
976 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
977 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
978 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
979 return FixedCost + LT.first * (GatherCost + SlideCost);
980 }
981 }
982 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
983 SubTp);
984}
985
986static unsigned isM1OrSmaller(MVT VT) {
987 RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
988 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
989 LMUL == RISCVVType::VLMUL::LMUL_F4 ||
990 LMUL == RISCVVType::VLMUL::LMUL_F2 ||
991 LMUL == RISCVVType::VLMUL::LMUL_1);
992}
993
995 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
996 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
997 TTI::VectorInstrContext VIC) const {
1000
1001 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1002 // For now, skip all fixed vector cost analysis when P extension is available
1003 // to avoid crashes in getMinRVVVectorSizeInBits()
1004 if (ST->enablePExtSIMDCodeGen() && isa<FixedVectorType>(Ty)) {
1005 return 1; // Treat as single instruction cost for now
1006 }
1007
1008 // A build_vector (which is m1 sized or smaller) can be done in no
1009 // worse than one vslide1down.vx per element in the type. We could
1010 // in theory do an explode_vector in the inverse manner, but our
1011 // lowering today does not have a first class node for this pattern.
1013 Ty, DemandedElts, Insert, Extract, CostKind);
1014 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1015 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1016 if (Ty->getScalarSizeInBits() == 1) {
1017 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
1018 // Note: Implicit scalar anyextend is assumed to be free since the i1
1019 // must be stored in a GPR.
1020 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
1021 CostKind) +
1022 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
1024 }
1025
1026 assert(LT.second.isFixedLengthVector());
1027 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
1028 if (isM1OrSmaller(ContainerVT)) {
1029 InstructionCost BV =
1030 cast<FixedVectorType>(Ty)->getNumElements() *
1031 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1032 if (BV < Cost)
1033 Cost = BV;
1034 }
1035 }
1036 return Cost;
1037}
1038
1042 Type *DataTy = MICA.getDataType();
1043 Align Alignment = MICA.getAlignment();
1044 switch (MICA.getID()) {
1045 case Intrinsic::vp_load_ff: {
1046 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1047 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1049
1050 unsigned AS = MICA.getAddressSpace();
1051 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1052 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1053 }
1054 case Intrinsic::experimental_vp_strided_load:
1055 case Intrinsic::experimental_vp_strided_store:
1056 return getStridedMemoryOpCost(MICA, CostKind);
1057 case Intrinsic::masked_compressstore:
1058 case Intrinsic::masked_expandload:
1060 case Intrinsic::vp_scatter:
1061 case Intrinsic::vp_gather:
1062 case Intrinsic::masked_scatter:
1063 case Intrinsic::masked_gather:
1064 return getGatherScatterOpCost(MICA, CostKind);
1065 case Intrinsic::vp_load:
1066 case Intrinsic::vp_store:
1067 case Intrinsic::masked_load:
1068 case Intrinsic::masked_store:
1069 return getMaskedMemoryOpCost(MICA, CostKind);
1070 }
1072}
1073
1077 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1078 : Instruction::Store;
1079 Type *Src = MICA.getDataType();
1080 Align Alignment = MICA.getAlignment();
1081 unsigned AddressSpace = MICA.getAddressSpace();
1082
1083 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1086
1087 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1088}
1089
1091 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1092 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1093 bool UseMaskForCond, bool UseMaskForGaps) const {
1094
1095 // The interleaved memory access pass will lower (de)interleave ops combined
1096 // with an adjacent appropriate memory operation to vlseg/vsseg intrinsics. vlseg/vsseg
1097 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1098 // gap).
1099 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1100 auto *VTy = cast<VectorType>(VecTy);
1101 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1102 // Need to make sure the type hasn't been scalarized
1103 if (LT.second.isVector()) {
1104 auto *SubVecTy =
1105 VectorType::get(VTy->getElementType(),
1106 VTy->getElementCount().divideCoefficientBy(Factor));
1107 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1108 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1109 AddressSpace, DL)) {
1110
1111 // Some processors optimize segment loads/stores as one wide memory op +
1112 // Factor * LMUL shuffle ops.
1113 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1115 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1116 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1117 Cost += Factor * TLI->getLMULCost(SubVecVT);
1118 return LT.first * Cost;
1119 }
1120
1121 // Otherwise, the cost is proportional to the number of elements (VL *
1122 // Factor ops).
1123 InstructionCost MemOpCost =
1124 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1125 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1126 unsigned NumLoads = getEstimatedVLFor(VTy);
1127 return NumLoads * MemOpCost;
1128 }
1129 }
1130 }
1131
1132 // TODO: Return the cost of interleaved accesses for scalable vectors when
1133 // unable to convert them to segment access instructions.
1134 if (isa<ScalableVectorType>(VecTy))
1136
1137 auto *FVTy = cast<FixedVectorType>(VecTy);
1138 InstructionCost MemCost =
1139 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1140 unsigned VF = FVTy->getNumElements() / Factor;
1141
1142 // An interleaved load will look like this for Factor=3:
1143 // %wide.vec = load <12 x i32>, ptr %3, align 4
1144 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1145 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1146 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1147 if (Opcode == Instruction::Load) {
1148 InstructionCost Cost = MemCost;
1149 for (unsigned Index : Indices) {
1150 FixedVectorType *VecTy =
1151 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1152 auto Mask = createStrideMask(Index, Factor, VF);
1153 Mask.resize(VF * Factor, -1);
1154 InstructionCost ShuffleCost =
1156 Mask, CostKind, 0, nullptr, {});
1157 Cost += ShuffleCost;
1158 }
1159 return Cost;
1160 }
1161
1162 // TODO: Model for NF > 2
1163 // We'll need to enhance getShuffleCost to model shuffles that are just
1164 // inserts and extracts into subvectors, since they won't have the full cost
1165 // of a vrgather.
1166 // An interleaved store for 3 vectors of 4 lanes will look like
1167 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1168 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1169 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1170 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1171 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1172 if (Factor != 2)
1173 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1174 Alignment, AddressSpace, CostKind,
1175 UseMaskForCond, UseMaskForGaps);
1176
1177 assert(Opcode == Instruction::Store && "Opcode must be a store");
1178 // For an interleaving store of 2 vectors, we perform one large interleaving
1179 // shuffle that goes into the wide store
1180 auto Mask = createInterleaveMask(VF, Factor);
1181 InstructionCost ShuffleCost =
1183 CostKind, 0, nullptr, {});
1184 return MemCost + ShuffleCost;
1185}
1186
1190
1191 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1192 MICA.getID() == Intrinsic::vp_gather;
1193 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1194 Type *DataTy = MICA.getDataType();
1195 Align Alignment = MICA.getAlignment();
1198
1199 if ((Opcode == Instruction::Load &&
1200 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1201 (Opcode == Instruction::Store &&
1202 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1204
1205 // Cost is proportional to the number of memory operations implied. For
1206 // scalable vectors, we use an estimate on that number since we don't
1207 // know exactly what VL will be.
1208 auto &VTy = *cast<VectorType>(DataTy);
1209 unsigned NumLoads = getEstimatedVLFor(&VTy);
1210 return NumLoads * TTI::TCC_Basic;
1211}
1212
1214 const MemIntrinsicCostAttributes &MICA,
1216 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1217 ? Instruction::Load
1218 : Instruction::Store;
1219 Type *DataTy = MICA.getDataType();
1220 bool VariableMask = MICA.getVariableMask();
1221 Align Alignment = MICA.getAlignment();
1222 bool IsLegal = (Opcode == Instruction::Store &&
1223 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1224 (Opcode == Instruction::Load &&
1225 isLegalMaskedExpandLoad(DataTy, Alignment));
1226 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1228 // Example compressstore sequence:
1229 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1230 // vcompress.vm v10, v8, v0
1231 // vcpop.m a1, v0
1232 // vsetvli zero, a1, e32, m2, ta, ma
1233 // vse32.v v10, (a0)
1234 // Example expandload sequence:
1235 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1236 // vcpop.m a1, v0
1237 // vsetvli zero, a1, e32, m2, ta, ma
1238 // vle32.v v10, (a0)
1239 // vsetivli zero, 8, e32, m2, ta, ma
1240 // viota.m v12, v0
1241 // vrgather.vv v8, v10, v12, v0.t
1242 auto MemOpCost =
1243 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1244 auto LT = getTypeLegalizationCost(DataTy);
1245 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1246 if (VariableMask)
1247 Opcodes.push_back(RISCV::VCPOP_M);
1248 if (Opcode == Instruction::Store)
1249 Opcodes.append({RISCV::VCOMPRESS_VM});
1250 else
1251 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1252 return MemOpCost +
1253 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1254}
1255
1259
1260 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1261 ? Instruction::Load
1262 : Instruction::Store;
1263
1264 Type *DataTy = MICA.getDataType();
1265 Align Alignment = MICA.getAlignment();
1266 const Instruction *I = MICA.getInst();
1267
1268 if (!isLegalStridedLoadStore(DataTy, Alignment))
1270
1272 return TTI::TCC_Basic;
1273
1274 // Cost is proportional to the number of memory operations implied. For
1275 // scalable vectors, we use an estimate on that number since we don't
1276 // know exactly what VL will be.
1277 // FIXME: This will overcost for i64 on rv32 with +zve64x.
1278 auto &VTy = *cast<VectorType>(DataTy);
1279 InstructionCost MemOpCost =
1280 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1281 {TTI::OK_AnyValue, TTI::OP_None}, I);
1282 unsigned NumLoads = getEstimatedVLFor(&VTy);
1283 return NumLoads * MemOpCost;
1284}
1285
1288 // FIXME: This is a property of the default vector convention, not
1289 // all possible calling conventions. Fixing that will require
1290 // some TTI API and SLP rework.
1293 for (auto *Ty : Tys) {
1294 if (!Ty->isVectorTy())
1295 continue;
1296 Align A = DL.getPrefTypeAlign(Ty);
1297 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1298 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1299 }
1300 return Cost;
1301}
1302
1303// Currently, these represent both throughput and codesize costs
1304// for the respective intrinsics. The costs in this table are simply
1305// instruction counts with the following adjustments made:
1306// * One vsetvli is considered free.
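// The table costs are per legalized vector operation; the lookup result below
// is scaled by the number of legalization splits (LT.first).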
1307static const CostTblEntry VectorIntrinsicCostTable[]{
1308 {Intrinsic::floor, MVT::f32, 9},
1309 {Intrinsic::floor, MVT::f64, 9},
1310 {Intrinsic::ceil, MVT::f32, 9},
1311 {Intrinsic::ceil, MVT::f64, 9},
1312 {Intrinsic::trunc, MVT::f32, 7},
1313 {Intrinsic::trunc, MVT::f64, 7},
1314 {Intrinsic::round, MVT::f32, 9},
1315 {Intrinsic::round, MVT::f64, 9},
1316 {Intrinsic::roundeven, MVT::f32, 9},
1317 {Intrinsic::roundeven, MVT::f64, 9},
1318 {Intrinsic::rint, MVT::f32, 7},
1319 {Intrinsic::rint, MVT::f64, 7},
1320 {Intrinsic::nearbyint, MVT::f32, 9},
1321 {Intrinsic::nearbyint, MVT::f64, 9},
1322 {Intrinsic::bswap, MVT::i16, 3},
1323 {Intrinsic::bswap, MVT::i32, 12},
1324 {Intrinsic::bswap, MVT::i64, 31},
1325 {Intrinsic::vp_bswap, MVT::i16, 3},
1326 {Intrinsic::vp_bswap, MVT::i32, 12},
1327 {Intrinsic::vp_bswap, MVT::i64, 31},
1328 {Intrinsic::vp_fshl, MVT::i8, 7},
1329 {Intrinsic::vp_fshl, MVT::i16, 7},
1330 {Intrinsic::vp_fshl, MVT::i32, 7},
1331 {Intrinsic::vp_fshl, MVT::i64, 7},
1332 {Intrinsic::vp_fshr, MVT::i8, 7},
1333 {Intrinsic::vp_fshr, MVT::i16, 7},
1334 {Intrinsic::vp_fshr, MVT::i32, 7},
1335 {Intrinsic::vp_fshr, MVT::i64, 7},
1336 {Intrinsic::bitreverse, MVT::i8, 17},
1337 {Intrinsic::bitreverse, MVT::i16, 24},
1338 {Intrinsic::bitreverse, MVT::i32, 33},
1339 {Intrinsic::bitreverse, MVT::i64, 52},
1340 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1341 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1342 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1343 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1344 {Intrinsic::ctpop, MVT::i8, 12},
1345 {Intrinsic::ctpop, MVT::i16, 19},
1346 {Intrinsic::ctpop, MVT::i32, 20},
1347 {Intrinsic::ctpop, MVT::i64, 21},
1348 {Intrinsic::ctlz, MVT::i8, 19},
1349 {Intrinsic::ctlz, MVT::i16, 28},
1350 {Intrinsic::ctlz, MVT::i32, 31},
1351 {Intrinsic::ctlz, MVT::i64, 35},
1352 {Intrinsic::cttz, MVT::i8, 16},
1353 {Intrinsic::cttz, MVT::i16, 23},
1354 {Intrinsic::cttz, MVT::i32, 24},
1355 {Intrinsic::cttz, MVT::i64, 25},
1356 {Intrinsic::vp_ctpop, MVT::i8, 12},
1357 {Intrinsic::vp_ctpop, MVT::i16, 19},
1358 {Intrinsic::vp_ctpop, MVT::i32, 20},
1359 {Intrinsic::vp_ctpop, MVT::i64, 21},
1360 {Intrinsic::vp_ctlz, MVT::i8, 19},
1361 {Intrinsic::vp_ctlz, MVT::i16, 28},
1362 {Intrinsic::vp_ctlz, MVT::i32, 31},
1363 {Intrinsic::vp_ctlz, MVT::i64, 35},
1364 {Intrinsic::vp_cttz, MVT::i8, 16},
1365 {Intrinsic::vp_cttz, MVT::i16, 23},
1366 {Intrinsic::vp_cttz, MVT::i32, 24},
1367 {Intrinsic::vp_cttz, MVT::i64, 25},
1368};
1369
1373 auto *RetTy = ICA.getReturnType();
1374 switch (ICA.getID()) {
1375 case Intrinsic::lrint:
1376 case Intrinsic::llrint:
1377 case Intrinsic::lround:
1378 case Intrinsic::llround: {
1379 auto LT = getTypeLegalizationCost(RetTy);
1380 Type *SrcTy = ICA.getArgTypes().front();
1381 auto SrcLT = getTypeLegalizationCost(SrcTy);
1382 if (ST->hasVInstructions() && LT.second.isVector()) {
1384 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1385 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1386 if (LT.second.getVectorElementType() == MVT::bf16) {
1387 if (!ST->hasVInstructionsBF16Minimal())
1389 if (DstEltSz == 32)
1390 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1391 else
1392 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1393 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1394 !ST->hasVInstructionsF16()) {
1395 if (!ST->hasVInstructionsF16Minimal())
1397 if (DstEltSz == 32)
1398 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1399 else
1400 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1401
1402 } else if (SrcEltSz > DstEltSz) {
1403 Ops = {RISCV::VFNCVT_X_F_W};
1404 } else if (SrcEltSz < DstEltSz) {
1405 Ops = {RISCV::VFWCVT_X_F_V};
1406 } else {
1407 Ops = {RISCV::VFCVT_X_F_V};
1408 }
1409
1410 // We need to use the source LMUL in the case of a narrowing op, and the
1411 // destination LMUL otherwise.
1412 if (SrcEltSz > DstEltSz)
1413 return SrcLT.first *
1414 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1415 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1416 }
1417 break;
1418 }
1419 case Intrinsic::ceil:
1420 case Intrinsic::floor:
1421 case Intrinsic::trunc:
1422 case Intrinsic::rint:
1423 case Intrinsic::round:
1424 case Intrinsic::roundeven: {
1425 // These all use the same code.
1426 auto LT = getTypeLegalizationCost(RetTy);
1427 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1428 return LT.first * 8;
1429 break;
1430 }
1431 case Intrinsic::umin:
1432 case Intrinsic::umax:
1433 case Intrinsic::smin:
1434 case Intrinsic::smax: {
1435 auto LT = getTypeLegalizationCost(RetTy);
1436 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1437 return LT.first;
1438
1439 if (ST->hasVInstructions() && LT.second.isVector()) {
1440 unsigned Op;
1441 switch (ICA.getID()) {
1442 case Intrinsic::umin:
1443 Op = RISCV::VMINU_VV;
1444 break;
1445 case Intrinsic::umax:
1446 Op = RISCV::VMAXU_VV;
1447 break;
1448 case Intrinsic::smin:
1449 Op = RISCV::VMIN_VV;
1450 break;
1451 case Intrinsic::smax:
1452 Op = RISCV::VMAX_VV;
1453 break;
1454 }
1455 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1456 }
1457 break;
1458 }
1459 case Intrinsic::sadd_sat:
1460 case Intrinsic::ssub_sat:
1461 case Intrinsic::uadd_sat:
1462 case Intrinsic::usub_sat: {
1463 auto LT = getTypeLegalizationCost(RetTy);
1464 if (ST->hasVInstructions() && LT.second.isVector()) {
1465 unsigned Op;
1466 switch (ICA.getID()) {
1467 case Intrinsic::sadd_sat:
1468 Op = RISCV::VSADD_VV;
1469 break;
1470 case Intrinsic::ssub_sat:
1471 Op = RISCV::VSSUB_VV;
1472 break;
1473 case Intrinsic::uadd_sat:
1474 Op = RISCV::VSADDU_VV;
1475 break;
1476 case Intrinsic::usub_sat:
1477 Op = RISCV::VSSUBU_VV;
1478 break;
1479 }
1480 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1481 }
1482 break;
1483 }
1484 case Intrinsic::fma:
1485 case Intrinsic::fmuladd: {
1486 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1487 auto LT = getTypeLegalizationCost(RetTy);
1488 if (ST->hasVInstructions() && LT.second.isVector())
1489 return LT.first *
1490 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1491 break;
1492 }
1493 case Intrinsic::fabs: {
1494 auto LT = getTypeLegalizationCost(RetTy);
1495 if (ST->hasVInstructions() && LT.second.isVector()) {
1496 // lui a0, 8
1497 // addi a0, a0, -1
1498 // vsetvli a1, zero, e16, m1, ta, ma
1499 // vand.vx v8, v8, a0
1500 // f16 with zvfhmin and bf16 with zvfbfmin
1501 if (LT.second.getVectorElementType() == MVT::bf16 ||
1502 (LT.second.getVectorElementType() == MVT::f16 &&
1503 !ST->hasVInstructionsF16()))
1504 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1505 CostKind) +
1506 2;
1507 else
1508 return LT.first *
1509 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1510 }
1511 break;
1512 }
1513 case Intrinsic::sqrt: {
1514 auto LT = getTypeLegalizationCost(RetTy);
1515 if (ST->hasVInstructions() && LT.second.isVector()) {
1518 MVT ConvType = LT.second;
1519 MVT FsqrtType = LT.second;
1520 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1521 // will be split.
1522 if (LT.second.getVectorElementType() == MVT::bf16) {
1523 if (LT.second == MVT::nxv32bf16) {
1524 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1525 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1526 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1527 ConvType = MVT::nxv16f16;
1528 FsqrtType = MVT::nxv16f32;
1529 } else {
1530 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1531 FsqrtOp = {RISCV::VFSQRT_V};
1532 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1533 }
1534 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1535 !ST->hasVInstructionsF16()) {
1536 if (LT.second == MVT::nxv32f16) {
1537 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1538 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1539 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1540 ConvType = MVT::nxv16f16;
1541 FsqrtType = MVT::nxv16f32;
1542 } else {
1543 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1544 FsqrtOp = {RISCV::VFSQRT_V};
1545 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1546 }
1547 } else {
1548 FsqrtOp = {RISCV::VFSQRT_V};
1549 }
1550
1551 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1552 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1553 }
1554 break;
1555 }
1556 case Intrinsic::cttz:
1557 case Intrinsic::ctlz:
1558 case Intrinsic::ctpop: {
1559 auto LT = getTypeLegalizationCost(RetTy);
1560 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1561 unsigned Op;
1562 switch (ICA.getID()) {
1563 case Intrinsic::cttz:
1564 Op = RISCV::VCTZ_V;
1565 break;
1566 case Intrinsic::ctlz:
1567 Op = RISCV::VCLZ_V;
1568 break;
1569 case Intrinsic::ctpop:
1570 Op = RISCV::VCPOP_V;
1571 break;
1572 }
1573 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1574 }
1575 break;
1576 }
1577 case Intrinsic::abs: {
1578 auto LT = getTypeLegalizationCost(RetTy);
1579 if (ST->hasVInstructions() && LT.second.isVector()) {
1580 // vrsub.vi v10, v8, 0
1581 // vmax.vv v8, v8, v10
1582 return LT.first *
1583 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1584 LT.second, CostKind);
1585 }
1586 break;
1587 }
1588 case Intrinsic::fshl:
1589 case Intrinsic::fshr: {
1590 if (ICA.getArgs().empty())
1591 break;
1592
1593 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1594 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1595 // instruction.
1596 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1597 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1598 (RetTy->getIntegerBitWidth() == 32 ||
1599 RetTy->getIntegerBitWidth() == 64) &&
1600 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1601 return 1;
1602 }
1603 break;
1604 }
1605 case Intrinsic::get_active_lane_mask: {
1606 if (ST->hasVInstructions()) {
1607 Type *ExpRetTy = VectorType::get(
1608 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1609 auto LT = getTypeLegalizationCost(ExpRetTy);
1610
1611 // vid.v v8 // considered hoisted
1612 // vsaddu.vx v8, v8, a0
1613 // vmsltu.vx v0, v8, a1
1614 return LT.first *
1615 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1616 LT.second, CostKind);
1617 }
1618 break;
1619 }
1620 // TODO: Add more intrinsics.
1621 case Intrinsic::stepvector: {
1622 auto LT = getTypeLegalizationCost(RetTy);
1623 // Legalisation of illegal types involves an `index' instruction plus
1624 // (LT.first - 1) vector adds.
1625 if (ST->hasVInstructions())
1626 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1627 (LT.first - 1) *
1628 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1629 return 1 + (LT.first - 1);
1630 }
1631 case Intrinsic::experimental_cttz_elts: {
1632 Type *ArgTy = ICA.getArgTypes()[0];
1633 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1634 if (getTLI()->shouldExpandCttzElements(ArgType))
1635 break;
1636 InstructionCost Cost = getRISCVInstructionCost(
1637 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1638
1639 // If zero_is_poison is false, then we will generate additional
1640 // cmp + select instructions to convert -1 to EVL.
1641 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1642 if (ICA.getArgs().size() > 1 &&
1643 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1644 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1646 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1648
1649 return Cost;
1650 }
1651 case Intrinsic::experimental_vp_splice: {
1652 // To support type-based query from vectorizer, set the index to 0.
1653 // Note that the index only changes the cost from vslide.vx to vslide.vi, and in
1654 // the current implementation they have the same cost.
1656 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1658 }
1659 case Intrinsic::fptoui_sat:
1660 case Intrinsic::fptosi_sat: {
1662 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1663 Type *SrcTy = ICA.getArgTypes()[0];
1664
1665 auto SrcLT = getTypeLegalizationCost(SrcTy);
1666 auto DstLT = getTypeLegalizationCost(RetTy);
1667 if (!SrcTy->isVectorTy())
1668 break;
1669
1670 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1671 return InstructionCost::getInvalid();
1672
1673 Cost +=
1674 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1675 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1676
1677 // Handle NaN.
1678 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1679 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1680 Type *CondTy = RetTy->getWithNewBitWidth(1);
1681 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1682 CmpInst::FCMP_UNO, CostKind);
1683 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1684 CmpInst::FCMP_UNO, CostKind);
1685 return Cost;
1686 }
1687 }
1688
1689 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1690 if (auto LT = getTypeLegalizationCost(RetTy);
1691 LT.second.isVector()) {
1692 MVT EltTy = LT.second.getVectorElementType();
1693 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1694 ICA.getID(), EltTy))
1695 return LT.first * Entry->Cost;
1696 }
1697 }
1698
1699 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1700 }
1701
1702 InstructionCost
1703 RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
1704 const SCEV *Ptr,
1705 TTI::TargetCostKind CostKind) const {
1706 // Address computations for vector indexed load/store likely require an offset
1707 // and/or scaling.
1708 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1709 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1710
1711 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1712}
1713
1714 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1715 Type *Src,
1716 TTI::CastContextHint CCH,
1717 TTI::TargetCostKind CostKind,
1718 const Instruction *I) const {
1719 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1720 if (!IsVectorType)
1721 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1722
1723 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1724 // For now, skip all fixed vector cost analysis when P extension is available
1725 // to avoid crashes in getMinRVVVectorSizeInBits()
1726 if (ST->enablePExtSIMDCodeGen() &&
1727 (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) {
1728 return 1; // Treat as single instruction cost for now
1729 }
1730
1731 // FIXME: Need to compute legalizing cost for illegal types. The current
1732 // code handles only legal types and those which can be trivially
1733 // promoted to legal.
1734 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1735 Dst->getScalarSizeInBits() > ST->getELen())
1736 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1737
1738 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1739 assert(ISD && "Invalid opcode");
1740 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1741 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1742
1743 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1744 // The shared implementation doesn't model vector widening during legalization
1745 // and instead assumes scalarization. In order to scalarize an <N x i1>
1746 // vector, we need to extend/trunc to/from i8. If we don't special case
1747 // this, we can get an infinite recursion cycle.
1748 switch (ISD) {
1749 default:
1750 break;
1751 case ISD::SIGN_EXTEND:
1752 case ISD::ZERO_EXTEND:
1753 if (Src->getScalarSizeInBits() == 1) {
1754 // We do not use vsext/vzext to extend from mask vector.
1755 // Instead we use the following instructions to extend from mask vector:
1756 // vmv.v.i v8, 0
1757 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1758 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1759 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1760 DstLT.second, CostKind) +
1761 DstLT.first - 1;
1762 }
1763 break;
1764 case ISD::TRUNCATE:
1765 if (Dst->getScalarSizeInBits() == 1) {
1766 // We do not use several vncvt instructions to truncate to a mask vector,
1767 // so we cannot use PowDiff to calculate the cost.
1768 // Instead we use the following instructions to truncate to mask vector:
1769 // vand.vi v8, v8, 1
1770 // vmsne.vi v0, v8, 0
1771 return SrcLT.first *
1772 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1773 SrcLT.second, CostKind) +
1774 SrcLT.first - 1;
1775 }
1776 break;
1777 };
1778
1779 // Our actual lowering for the case where a wider legal type is available
1780 // uses promotion to the wider type. This is reflected in the result of
1781 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1782 // scalarized if the legalized Src and Dst are not equal sized.
1783 const DataLayout &DL = this->getDataLayout();
1784 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1785 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1786 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1787 SrcLT.second.getSizeInBits()) ||
1788 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1789 DstLT.second.getSizeInBits()) ||
1790 SrcLT.first > 1 || DstLT.first > 1)
1791 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1792
1793 // The split cost is handled by the base getCastInstrCost
1794 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1795
1796 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1797 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1798 switch (ISD) {
1799 case ISD::SIGN_EXTEND:
1800 case ISD::ZERO_EXTEND: {
1801 if ((PowDiff < 1) || (PowDiff > 3))
1802 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1803 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1804 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1805 unsigned Op =
1806 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1807 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1808 }
1809 case ISD::TRUNCATE:
1810 case ISD::FP_EXTEND:
1811 case ISD::FP_ROUND: {
1812 // Counts of narrow/widen instructions.
1813 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1814 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1815
1816 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1817 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1818 : RISCV::VFNCVT_F_F_W;
1819 InstructionCost Cost = 0;
1820 for (; SrcEltSize != DstEltSize;) {
1821 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1822 ? MVT::getIntegerVT(DstEltSize)
1823 : MVT::getFloatingPointVT(DstEltSize);
1824 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1825 DstEltSize =
1826 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1827 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1828 }
1829 return Cost;
1830 }
1831 case ISD::FP_TO_SINT:
1832 case ISD::FP_TO_UINT: {
1833 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1834 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1835 unsigned FWCVT =
1836 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1837 unsigned FNCVT =
1838 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1839 unsigned SrcEltSize = Src->getScalarSizeInBits();
1840 unsigned DstEltSize = Dst->getScalarSizeInBits();
1841 InstructionCost Cost = 0;
1842 if ((SrcEltSize == 16) &&
1843 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1844 // If the target only supports zvfhmin, or it is an fp16-to-i64 conversion,
1845 // pre-widen to f32 and then convert f32 to the integer type.
1846 VectorType *VecF32Ty =
1847 VectorType::get(Type::getFloatTy(Dst->getContext()),
1848 cast<VectorType>(Dst)->getElementCount());
1849 std::pair<InstructionCost, MVT> VecF32LT =
1850 getTypeLegalizationCost(VecF32Ty);
1851 Cost +=
1852 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1853 VecF32LT.second, CostKind);
1854 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1855 return Cost;
1856 }
1857 if (DstEltSize == SrcEltSize)
1858 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1859 else if (DstEltSize > SrcEltSize)
1860 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1861 else { // (SrcEltSize > DstEltSize)
1862 // First do a narrowing conversion to an integer half the size, then
1863 // truncate if needed.
1864 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1865 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1866 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1867 if ((SrcEltSize / 2) > DstEltSize) {
1868 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1869 Cost +=
1870 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1871 }
1872 }
1873 return Cost;
1874 }
1875 case ISD::SINT_TO_FP:
1876 case ISD::UINT_TO_FP: {
1877 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1878 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1879 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1880 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1881 unsigned SrcEltSize = Src->getScalarSizeInBits();
1882 unsigned DstEltSize = Dst->getScalarSizeInBits();
1883
1884 InstructionCost Cost = 0;
1885 if ((DstEltSize == 16) &&
1886 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1887 // If the target only supports zvfhmin, or it is an i64-to-fp16 conversion,
1888 // the value is converted to f32 and then converted to f16.
1889 VectorType *VecF32Ty =
1890 VectorType::get(Type::getFloatTy(Dst->getContext()),
1891 cast<VectorType>(Dst)->getElementCount());
1892 std::pair<InstructionCost, MVT> VecF32LT =
1893 getTypeLegalizationCost(VecF32Ty);
1894 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1895 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1896 DstLT.second, CostKind);
1897 return Cost;
1898 }
1899
1900 if (DstEltSize == SrcEltSize)
1901 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1902 else if (DstEltSize > SrcEltSize) {
1903 if ((DstEltSize / 2) > SrcEltSize) {
1904 VectorType *VecTy =
1905 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1906 cast<VectorType>(Dst)->getElementCount());
1907 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1908 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1909 }
1910 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1911 } else
1912 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1913 return Cost;
1914 }
1915 }
1916 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1917}
1918
1919unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1920 if (isa<ScalableVectorType>(Ty)) {
1921 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1922 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1923 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1924 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1925 }
1926 return cast<FixedVectorType>(Ty)->getNumElements();
1927}
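// Illustrative example (assumes getVScaleForTuning() == 2, i.e. VLEN == 128):
// for <vscale x 4 x i32>, EltSize == 32 and MinSize == 128, so VectorBits ==
// 2 * RVVBitsPerBlock == 128 and the estimated VL is computeVLMAX(128, 32, 128)
// == 8 lanes. For a fixed <8 x i32> the estimate is simply the element count, 8.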
1928
1929 InstructionCost
1930 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1931 FastMathFlags FMF,
1932 TTI::TargetCostKind CostKind) const {
1933 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1934 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1935
1936 // Skip if scalar size of Ty is bigger than ELEN.
1937 if (Ty->getScalarSizeInBits() > ST->getELen())
1938 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1939
1940 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1941 if (Ty->getElementType()->isIntegerTy(1)) {
1942 // SelectionDAGBuilder does following transforms:
1943 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1944 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1945 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1946 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1947 else
1948 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1949 }
1950
1951 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1952 SmallVector<unsigned, 3> Opcodes;
1953 InstructionCost ExtraCost = 0;
1954 switch (IID) {
1955 case Intrinsic::maximum:
1956 if (FMF.noNaNs()) {
1957 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1958 } else {
1959 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1960 RISCV::VFMV_F_S};
1961 // Cost of materializing the canonical NaN + branch
1962 // lui a0, 523264
1963 // fmv.w.x fa0, a0
1964 Type *DstTy = Ty->getScalarType();
1965 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1966 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1967 ExtraCost = 1 +
1968 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1969 TTI::CastContextHint::None, CostKind) +
1970 getCFInstrCost(Instruction::Br, CostKind);
1971 }
1972 break;
1973
1974 case Intrinsic::minimum:
1975 if (FMF.noNaNs()) {
1976 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1977 } else {
1978 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1979 RISCV::VFMV_F_S};
1980 // Cost of materializing the canonical NaN + branch
1981 // lui a0, 523264
1982 // fmv.w.x fa0, a0
1983 Type *DstTy = Ty->getScalarType();
1984 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1985 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1986 ExtraCost = 1 +
1987 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1988 TTI::CastContextHint::None, CostKind) +
1989 getCFInstrCost(Instruction::Br, CostKind);
1990 }
1991 break;
1992 }
1993 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1994 }
1995
1996 // An IR reduction is composed of one RVV reduction instruction and a vmv.
1997 unsigned SplitOp;
1998 SmallVector<unsigned, 3> Opcodes;
1999 switch (IID) {
2000 default:
2001 llvm_unreachable("Unsupported intrinsic");
2002 case Intrinsic::smax:
2003 SplitOp = RISCV::VMAX_VV;
2004 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2005 break;
2006 case Intrinsic::smin:
2007 SplitOp = RISCV::VMIN_VV;
2008 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2009 break;
2010 case Intrinsic::umax:
2011 SplitOp = RISCV::VMAXU_VV;
2012 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2013 break;
2014 case Intrinsic::umin:
2015 SplitOp = RISCV::VMINU_VV;
2016 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2017 break;
2018 case Intrinsic::maxnum:
2019 SplitOp = RISCV::VFMAX_VV;
2020 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2021 break;
2022 case Intrinsic::minnum:
2023 SplitOp = RISCV::VFMIN_VV;
2024 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2025 break;
2026 }
2027 // Add a cost for data larger than LMUL8
2028 InstructionCost SplitCost =
2029 (LT.first > 1) ? (LT.first - 1) *
2030 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2031 : 0;
2032 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2033}
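// Illustrative example (assumed type): llvm.vector.reduce.smax on
// <vscale x 16 x i64> legalizes to two m8 halves (LT.first == 2), so the
// modelled sequence is one vmax.vv split fix-up plus the reduction itself:
//   vmax.vv    v8, v8, v16   ; (LT.first - 1) * cost(VMAX_VV)
//   vredmax.vs v8, v8, v9    ; log2(VL)-based cost, see getRISCVInstructionCost
//   vmv.x.s    a0, v8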
2034
2037 std::optional<FastMathFlags> FMF,
2039 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2040 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2041
2042 // Skip if scalar size of Ty is bigger than ELEN.
2043 if (Ty->getScalarSizeInBits() > ST->getELen())
2044 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2045
2046 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2047 assert(ISD && "Invalid opcode");
2048
2049 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2050 ISD != ISD::FADD)
2051 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2052
2053 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2054 Type *ElementTy = Ty->getElementType();
2055 if (ElementTy->isIntegerTy(1)) {
2056 // Example sequences:
2057 // vfirst.m a0, v0
2058 // seqz a0, a0
2059 if (LT.second == MVT::v1i1)
2060 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2061 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2062 CmpInst::ICMP_EQ, CostKind);
2063
2064 if (ISD == ISD::AND) {
2065 // Example sequences:
2066 // vmand.mm v8, v9, v8 ; needed every time type is split
2067 // vmnot.m v8, v0 ; alias for vmnand
2068 // vcpop.m a0, v8
2069 // seqz a0, a0
2070
2071 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2072 // For LMUL <= 8, there is no splitting,
2073 // the sequences are vmnot, vcpop and seqz.
2074 // When LMUL > 8 and split = 1,
2075 // the sequences are vmnand, vcpop and seqz.
2076 // When LMUL > 8 and split > 1,
2077 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2078 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2079 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2080 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2081 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2082 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2083 CmpInst::ICMP_EQ, CostKind);
2084 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2085 // Example sequences:
2086 // vsetvli a0, zero, e8, mf8, ta, ma
2087 // vmxor.mm v8, v0, v8 ; needed every time type is split
2088 // vcpop.m a0, v8
2089 // andi a0, a0, 1
2090 return (LT.first - 1) *
2091 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2092 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2093 } else {
2094 assert(ISD == ISD::OR);
2095 // Example sequences:
2096 // vsetvli a0, zero, e8, mf8, ta, ma
2097 // vmor.mm v8, v9, v8 ; needed every time type is split
2098 // vcpop.m a0, v0
2099 // snez a0, a0
2100 return (LT.first - 1) *
2101 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2102 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2103 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2104 CmpInst::ICMP_NE, CostKind);
2105 }
2106 }
2107
2108 // An IR reduction of or/and is composed of one vmv and one RVV reduction
2109 // instruction, while the others are composed of two vmv and one RVV
2110 // reduction instruction.
2111 unsigned SplitOp;
2112 SmallVector<unsigned, 3> Opcodes;
2113 switch (ISD) {
2114 case ISD::ADD:
2115 SplitOp = RISCV::VADD_VV;
2116 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2117 break;
2118 case ISD::OR:
2119 SplitOp = RISCV::VOR_VV;
2120 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2121 break;
2122 case ISD::XOR:
2123 SplitOp = RISCV::VXOR_VV;
2124 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2125 break;
2126 case ISD::AND:
2127 SplitOp = RISCV::VAND_VV;
2128 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2129 break;
2130 case ISD::FADD:
2131 // We can't promote f16/bf16 fadd reductions.
2132 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2133 LT.second.getScalarType() == MVT::bf16)
2134 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2135 if (TTI::requiresOrderedReduction(FMF)) {
2136 Opcodes.push_back(RISCV::VFMV_S_F);
2137 for (unsigned i = 0; i < LT.first.getValue(); i++)
2138 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2139 Opcodes.push_back(RISCV::VFMV_F_S);
2140 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2141 }
2142 SplitOp = RISCV::VFADD_VV;
2143 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2144 break;
2145 }
2146 // Add a cost for data larger than LMUL8
2147 InstructionCost SplitCost =
2148 (LT.first > 1) ? (LT.first - 1) *
2149 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2150 : 0;
2151 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2152}
2153
2155 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2156 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2157 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2158 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2159 FMF, CostKind);
2160
2161 // Skip if scalar size of ResTy is bigger than ELEN.
2162 if (ResTy->getScalarSizeInBits() > ST->getELen())
2163 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2164 FMF, CostKind);
2165
2166 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2167 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2168 FMF, CostKind);
2169
2170 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2171
2172 if (IsUnsigned && Opcode == Instruction::Add &&
2173 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2174 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2175 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
2176 return LT.first *
2177 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2178 }
2179
2180 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2181 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2182 FMF, CostKind);
2183
2184 return (LT.first - 1) +
2185 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2186}
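// Illustrative example (assumed fixed-length type): vector_reduce_add of
// zext <8 x i1> to <8 x i32> hits the i1 special case above and is modelled as
// a single vcpop.m, since the popcount of the mask already is the sum.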
2187
2191 assert(OpInfo.isConstant() && "non constant operand?");
2192 if (!isa<VectorType>(Ty))
2193 // FIXME: We need to account for immediate materialization here, but doing
2194 // a decent job requires more knowledge about the immediate than we
2195 // currently have here.
2196 return 0;
2197
2198 if (OpInfo.isUniform())
2199 // vmv.v.i, vmv.v.x, or vfmv.v.f
2200 // We ignore the cost of the scalar constant materialization to be consistent
2201 // with how we treat scalar constants themselves just above.
2202 return 1;
2203
2204 return getConstantPoolLoadCost(Ty, CostKind);
2205}
2206
2207 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2208 Align Alignment,
2209 unsigned AddressSpace,
2210 TTI::TargetCostKind CostKind,
2211 TTI::OperandValueInfo OpInfo,
2212 const Instruction *I) const {
2213 EVT VT = TLI->getValueType(DL, Src, true);
2214 // Type legalization can't handle structs
2215 if (VT == MVT::Other)
2216 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2217 CostKind, OpInfo, I);
2218
2219 InstructionCost Cost = 0;
2220 if (Opcode == Instruction::Store && OpInfo.isConstant())
2221 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2222
2223 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2224
2225 InstructionCost BaseCost = [&]() {
2226 InstructionCost Cost = LT.first;
2227 if (CostKind != TTI::TCK_RecipThroughput)
2228 return Cost;
2229
2230 // Our actual lowering for the case where a wider legal type is available
2231 // uses a VL-predicated load on the wider type. This is reflected in
2232 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2233 // widened cases are scalarized.
2234 const DataLayout &DL = this->getDataLayout();
2235 if (Src->isVectorTy() && LT.second.isVector() &&
2236 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2237 LT.second.getSizeInBits()))
2238 return Cost;
2239
2240 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2241 CostKind, OpInfo, I);
2242 }();
2243
2244 // Assume memory ops cost scale with the number of vector registers
2245 // possible accessed by the instruction. Note that BasicTTI already
2246 // handles the LT.first term for us.
2247 if (ST->hasVInstructions() && LT.second.isVector() &&
2248 CostKind != TTI::TCK_CodeSize)
2249 BaseCost *= TLI->getLMULCost(LT.second);
2250 return Cost + BaseCost;
2251}
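// Illustrative example (assumed type): a load of <vscale x 16 x i32> legalizes
// to a single m8 register group, so under TCK_RecipThroughput the LT.first base
// cost is additionally scaled by getLMULCost(m8) to reflect that the one
// vle32.v touches eight vector registers.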
2252
2253 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2254 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2255 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2256 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2257 if (CostKind != TTI::TCK_RecipThroughput)
2258 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2259 Op1Info, Op2Info, I);
2260
2261 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2262 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2263 Op1Info, Op2Info, I);
2264
2265 // Skip if scalar size of ValTy is bigger than ELEN.
2266 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2267 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2268 Op1Info, Op2Info, I);
2269
2270 auto GetConstantMatCost =
2271 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2272 if (OpInfo.isUniform())
2273 // We return 0 because we currently ignore the cost of materializing scalar
2274 // constants in GPRs.
2275 return 0;
2276
2277 return getConstantPoolLoadCost(ValTy, CostKind);
2278 };
2279
2280 InstructionCost ConstantMatCost;
2281 if (Op1Info.isConstant())
2282 ConstantMatCost += GetConstantMatCost(Op1Info);
2283 if (Op2Info.isConstant())
2284 ConstantMatCost += GetConstantMatCost(Op2Info);
2285
2286 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2287 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2288 if (CondTy->isVectorTy()) {
2289 if (ValTy->getScalarSizeInBits() == 1) {
2290 // vmandn.mm v8, v8, v9
2291 // vmand.mm v9, v0, v9
2292 // vmor.mm v0, v9, v8
2293 return ConstantMatCost +
2294 LT.first *
2295 getRISCVInstructionCost(
2296 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2297 LT.second, CostKind);
2298 }
2299 // vselect and max/min are supported natively.
2300 return ConstantMatCost +
2301 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2302 CostKind);
2303 }
2304
2305 if (ValTy->getScalarSizeInBits() == 1) {
2306 // vmv.v.x v9, a0
2307 // vmsne.vi v9, v9, 0
2308 // vmandn.mm v8, v8, v9
2309 // vmand.mm v9, v0, v9
2310 // vmor.mm v0, v9, v8
2311 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2312 return ConstantMatCost +
2313 LT.first *
2314 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2315 InterimVT, CostKind) +
2316 LT.first * getRISCVInstructionCost(
2317 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2318 LT.second, CostKind);
2319 }
2320
2321 // vmv.v.x v10, a0
2322 // vmsne.vi v0, v10, 0
2323 // vmerge.vvm v8, v9, v8, v0
2324 return ConstantMatCost +
2325 LT.first * getRISCVInstructionCost(
2326 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2327 LT.second, CostKind);
2328 }
2329
2330 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2331 CmpInst::isIntPredicate(VecPred)) {
2332 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2333 // provided they incur the same cost across all implementations
2334 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2335 LT.second,
2336 CostKind);
2337 }
2338
2339 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2340 CmpInst::isFPPredicate(VecPred)) {
2341
2342 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2343 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2344 return ConstantMatCost +
2345 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2346
2347 // If we do not support the input floating point vector type, use the base
2348 // one which will calculate as:
2349 // ScalarizeCost + Num * Cost for fixed vector,
2350 // InvalidCost for scalable vector.
2351 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2352 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2353 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2354 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2355 Op1Info, Op2Info, I);
2356
2357 // Assuming vector fp compare and mask instructions are all the same cost
2358 // until a need arises to differentiate them.
2359 switch (VecPred) {
2360 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2361 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2362 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2363 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2364 return ConstantMatCost +
2365 LT.first * getRISCVInstructionCost(
2366 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2367 LT.second, CostKind);
2368
2369 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2370 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2371 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2372 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2373 return ConstantMatCost +
2374 LT.first *
2375 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2376 LT.second, CostKind);
2377
2378 case CmpInst::FCMP_OEQ: // vmfeq.vv
2379 case CmpInst::FCMP_OGT: // vmflt.vv
2380 case CmpInst::FCMP_OGE: // vmfle.vv
2381 case CmpInst::FCMP_OLT: // vmflt.vv
2382 case CmpInst::FCMP_OLE: // vmfle.vv
2383 case CmpInst::FCMP_UNE: // vmfne.vv
2384 return ConstantMatCost +
2385 LT.first *
2386 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2387 default:
2388 break;
2389 }
2390 }
2391
2392 // With ShortForwardBranchOpt or ConditionalMoveFusion, a scalar icmp + select
2393 // pair will lower to SELECT_CC and then to PseudoCCMOVGPR, which generates a
2394 // conditional branch + mv. The cost of the scalar (icmp + select) will
2395 // be (0 + select instr cost).
2396 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2397 ValTy->isIntegerTy() && !I->user_empty()) {
2398 if (all_of(I->users(), [&](const User *U) {
2399 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2400 U->getType()->isIntegerTy() &&
2401 !isa<ConstantData>(U->getOperand(1)) &&
2402 !isa<ConstantData>(U->getOperand(2));
2403 }))
2404 return 0;
2405 }
2406
2407 // TODO: Add cost for scalar type.
2408
2409 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2410 Op1Info, Op2Info, I);
2411}
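// Illustrative example (assumed type with f32 vector support): fcmp one on
// <vscale x 4 x float> falls into the FCMP_ONE bucket above and is modelled as
// three mask ops (vmflt.vv + vmflt.vv + vmor.mm), while fcmp olt on the same
// type is a single vmflt.vv.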
2412
2413 InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2414 TTI::TargetCostKind CostKind,
2415 const Instruction *I) const {
2416 if (CostKind != TTI::TCK_RecipThroughput)
2417 return Opcode == Instruction::PHI ? 0 : 1;
2418 // Branches are assumed to be predicted.
2419 return 0;
2420}
2421
2423 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2424 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2425 assert(Val->isVectorTy() && "This must be a vector type");
2426
2427 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2428 // For now, skip all fixed vector cost analysis when P extension is available
2429 // to avoid crashes in getMinRVVVectorSizeInBits()
2430 if (ST->enablePExtSIMDCodeGen() && isa<FixedVectorType>(Val)) {
2431 return 1; // Treat as single instruction cost for now
2432 }
2433
2434 if (Opcode != Instruction::ExtractElement &&
2435 Opcode != Instruction::InsertElement)
2436 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2437 VIC);
2438
2439 // Legalize the type.
2440 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2441
2442 // This type is legalized to a scalar type.
2443 if (!LT.second.isVector()) {
2444 auto *FixedVecTy = cast<FixedVectorType>(Val);
2445 // If Index is a known constant, cost is zero.
2446 if (Index != -1U)
2447 return 0;
2448 // Extract/InsertElement with non-constant index is very costly when
2449 // scalarized; estimate cost of loads/stores sequence via the stack:
2450 // ExtractElement cost: store vector to stack, load scalar;
2451 // InsertElement cost: store vector to stack, store scalar, load vector.
2452 Type *ElemTy = FixedVecTy->getElementType();
2453 auto NumElems = FixedVecTy->getNumElements();
2454 auto Align = DL.getPrefTypeAlign(ElemTy);
2455 InstructionCost LoadCost =
2456 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2457 InstructionCost StoreCost =
2458 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2459 return Opcode == Instruction::ExtractElement
2460 ? StoreCost * NumElems + LoadCost
2461 : (StoreCost + LoadCost) * NumElems + StoreCost;
2462 }
2463
2464 // For unsupported scalable vector.
2465 if (LT.second.isScalableVector() && !LT.first.isValid())
2466 return LT.first;
2467
2468 // Mask vector extract/insert is expanded via e8.
2469 if (Val->getScalarSizeInBits() == 1) {
2470 VectorType *WideTy =
2471 VectorType::get(IntegerType::get(Val->getContext(), 8),
2472 cast<VectorType>(Val)->getElementCount());
2473 if (Opcode == Instruction::ExtractElement) {
2474 InstructionCost ExtendCost
2475 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2476 TTI::CastContextHint::None, CostKind);
2477 InstructionCost ExtractCost
2478 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2479 return ExtendCost + ExtractCost;
2480 }
2481 InstructionCost ExtendCost
2482 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2483 TTI::CastContextHint::None, CostKind);
2484 InstructionCost InsertCost
2485 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2486 InstructionCost TruncCost
2487 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2488 TTI::CastContextHint::None, CostKind);
2489 return ExtendCost + InsertCost + TruncCost;
2490 }
2491
2492
2493 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2494 // and vslideup + vmv.s.x to insert element to vector.
2495 unsigned BaseCost = 1;
2496 // For insertelement we also need an addi to compute index + 1, which is used by vslideup.
2497 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2498
2499 if (Index != -1U) {
2500 // The type may be split. For fixed-width vectors we can normalize the
2501 // index to the new type.
2502 if (LT.second.isFixedLengthVector()) {
2503 unsigned Width = LT.second.getVectorNumElements();
2504 Index = Index % Width;
2505 }
2506
2507 // If exact VLEN is known, we will insert/extract into the appropriate
2508 // subvector with no additional subvector insert/extract cost.
2509 if (auto VLEN = ST->getRealVLen()) {
2510 unsigned EltSize = LT.second.getScalarSizeInBits();
2511 unsigned M1Max = *VLEN / EltSize;
2512 Index = Index % M1Max;
2513 }
2514
2515 if (Index == 0)
2516 // We can extract/insert the first element without vslidedown/vslideup.
2517 SlideCost = 0;
2518 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2519 Val->getScalarType()->isIntegerTy())
2520 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2521 else if (Opcode == Instruction::InsertElement)
2522 SlideCost = 1; // With a constant index, we do not need to use addi.
2523 }
2524
2525 // When the vector needs to be split into multiple register groups and the
2526 // index exceeds a single vector register group, we need to insert/extract the
2527 // element via the stack.
2528 if (LT.first > 1 &&
2529 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2530 LT.second.isScalableVector()))) {
2531 Type *ScalarType = Val->getScalarType();
2532 Align VecAlign = DL.getPrefTypeAlign(Val);
2533 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2534 // Extra addi for unknown index.
2535 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2536
2537 // Store all split vectors into stack and load the target element.
2538 if (Opcode == Instruction::ExtractElement)
2539 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2540 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2541 CostKind) +
2542 IdxCost;
2543
2544 // Store all split vectors into stack and store the target element and load
2545 // vectors back.
2546 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2547 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2548 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2549 CostKind) +
2550 IdxCost;
2551 }
2552
2553 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2554 if (Val->getScalarType()->isIntegerTy() &&
2555 ST->getXLen() < Val->getScalarSizeInBits()) {
2556 // For extractelement, we need the following instructions:
2557 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2558 // vslidedown.vx v8, v8, a0
2559 // vmv.x.s a0, v8
2560 // li a1, 32
2561 // vsrl.vx v8, v8, a1
2562 // vmv.x.s a1, v8
2563
2564 // For insertelement, we need the following instructions:
2565 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2566 // vmv.v.i v12, 0
2567 // vslide1up.vx v16, v12, a1
2568 // vslide1up.vx v12, v16, a0
2569 // addi a0, a2, 1
2570 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2571 // vslideup.vx v8, v12, a2
2572
2573 // TODO: should we count these special vsetvlis?
2574 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2575 }
2576 return BaseCost + SlideCost;
2577}
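// Illustrative example (assumed RV64 with V): extractelement from
// <vscale x 2 x i64> at constant index 0 is just a vmv.x.s (BaseCost 1,
// SlideCost 0), while a non-constant index also needs a vslidedown.vx,
// giving BaseCost + SlideCost == 2.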
2578
2582 unsigned Index) const {
2583 if (isa<FixedVectorType>(Val))
2585 Index);
2586
2587 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2588 // for the cost of extracting the last lane of a scalable vector. It probably
2589 // needs a more accurate cost.
2590 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2591 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2592 return getVectorInstrCost(Opcode, Val, CostKind,
2593 EC.getKnownMinValue() - 1 - Index, nullptr,
2594 nullptr);
2595}
2596
2597 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2598 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2599 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2600 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2601
2602 // TODO: Handle more cost kinds.
2603 if (CostKind != TTI::TCK_RecipThroughput)
2604 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2605 Args, CxtI);
2606
2607 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2608 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2609 Args, CxtI);
2610
2611 // Skip if scalar size of Ty is bigger than ELEN.
2612 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2613 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2614 Args, CxtI);
2615
2616 // Legalize the type.
2617 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2618 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2619
2620 // TODO: Handle scalar type.
2621 if (!LT.second.isVector()) {
2622 static const CostTblEntry DivTbl[]{
2623 {ISD::UDIV, MVT::i32, TTI::TCC_Expensive},
2624 {ISD::UDIV, MVT::i64, TTI::TCC_Expensive},
2625 {ISD::SDIV, MVT::i32, TTI::TCC_Expensive},
2626 {ISD::SDIV, MVT::i64, TTI::TCC_Expensive},
2627 {ISD::UREM, MVT::i32, TTI::TCC_Expensive},
2628 {ISD::UREM, MVT::i64, TTI::TCC_Expensive},
2629 {ISD::SREM, MVT::i32, TTI::TCC_Expensive},
2630 {ISD::SREM, MVT::i64, TTI::TCC_Expensive}};
2631 if (TLI->isOperationLegalOrPromote(ISDOpcode, LT.second))
2632 if (const auto *Entry = CostTableLookup(DivTbl, ISDOpcode, LT.second))
2633 return Entry->Cost * LT.first;
2634
2635 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2636 Args, CxtI);
2637 }
2638
2639 // f16 with zvfhmin and bf16 will be promoted to f32.
2640 // FIXME: nxv32[b]f16 will be custom lowered and split.
2641 InstructionCost CastCost = 0;
2642 if ((LT.second.getVectorElementType() == MVT::f16 ||
2643 LT.second.getVectorElementType() == MVT::bf16) &&
2644 TLI->getOperationAction(ISDOpcode, LT.second) ==
2645 TargetLoweringBase::LegalizeAction::Promote) {
2646 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2647 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2648 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2649 // Add cost of extending arguments
2650 CastCost += LT.first * Args.size() *
2651 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2652 TTI::CastContextHint::None, CostKind);
2653 // Add cost of truncating result
2654 CastCost +=
2655 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2656 TTI::CastContextHint::None, CostKind);
2657 // Compute cost of op in promoted type
2658 LT.second = PromotedVT;
2659 }
2660
2661 auto getConstantMatCost =
2662 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2663 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2664 // Two sub-cases:
2665 // * Has a 5 bit immediate operand which can be splatted.
2666 // * Has a larger immediate which must be materialized in scalar register
2667 // We return 0 for both as we currently ignore the cost of materializing
2668 // scalar constants in GPRs.
2669 return 0;
2670
2671 return getConstantPoolLoadCost(Ty, CostKind);
2672 };
2673
2674 // Add the cost of materializing any constant vectors required.
2675 InstructionCost ConstantMatCost = 0;
2676 if (Op1Info.isConstant())
2677 ConstantMatCost += getConstantMatCost(0, Op1Info);
2678 if (Op2Info.isConstant())
2679 ConstantMatCost += getConstantMatCost(1, Op2Info);
2680
2681 unsigned Op;
2682 switch (ISDOpcode) {
2683 case ISD::ADD:
2684 case ISD::SUB:
2685 Op = RISCV::VADD_VV;
2686 break;
2687 case ISD::SHL:
2688 case ISD::SRL:
2689 case ISD::SRA:
2690 Op = RISCV::VSLL_VV;
2691 break;
2692 case ISD::AND:
2693 case ISD::OR:
2694 case ISD::XOR:
2695 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2696 break;
2697 case ISD::MUL:
2698 case ISD::MULHS:
2699 case ISD::MULHU:
2700 Op = RISCV::VMUL_VV;
2701 break;
2702 case ISD::SDIV:
2703 case ISD::UDIV:
2704 Op = RISCV::VDIV_VV;
2705 break;
2706 case ISD::SREM:
2707 case ISD::UREM:
2708 Op = RISCV::VREM_VV;
2709 break;
2710 case ISD::FADD:
2711 case ISD::FSUB:
2712 Op = RISCV::VFADD_VV;
2713 break;
2714 case ISD::FMUL:
2715 Op = RISCV::VFMUL_VV;
2716 break;
2717 case ISD::FDIV:
2718 Op = RISCV::VFDIV_VV;
2719 break;
2720 case ISD::FNEG:
2721 Op = RISCV::VFSGNJN_VV;
2722 break;
2723 default:
2724 // Assuming all other instructions have the same cost until a need arises to
2725 // differentiate them.
2726 return CastCost + ConstantMatCost +
2727 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2728 Args, CxtI);
2729 }
2730
2731 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2732 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2733 // ops are twice as expensive as integer ops. Do the same for vectors so
2734 // scalar floating point ops aren't cheaper than their vector equivalents.
2735 if (Ty->isFPOrFPVectorTy())
2736 InstrCost *= 2;
2737 return CastCost + ConstantMatCost + LT.first * InstrCost;
2738}
2739
2740// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2741 InstructionCost RISCVTTIImpl::getPointersChainCost(
2742 ArrayRef<const Value *> Ptrs, const Value *Base,
2743 const TTI::PointersChainInfo &Info, Type *AccessTy,
2744 TTI::TargetCostKind CostKind) const {
2745 InstructionCost Cost = TTI::TCC_Free;
2746 // In the basic model we take into account GEP instructions only
2747 // (although here can come alloca instruction, a value, constants and/or
2748 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2749 // pointer). Typically, if Base is a not a GEP-instruction and all the
2750 // pointers are relative to the same base address, all the rest are
2751 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2752 // base, we just calculate the cost of each non-Base GEP as an ADD operation
2753 // if any of its indices is a non-constant.
2754 // If there are no known dependencies between the pointers, the cost is
2755 // calculated as the sum of the costs of the GEP instructions.
2756 for (auto [I, V] : enumerate(Ptrs)) {
2757 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2758 if (!GEP)
2759 continue;
2760 if (Info.isSameBase() && V != Base) {
2761 if (GEP->hasAllConstantIndices())
2762 continue;
2763 // If the chain is unit-stride and BaseReg + stride*i is a legal
2764 // addressing mode, then presume the base GEP is sitting around in a
2765 // register somewhere and check if we can fold the offset relative to
2766 // it.
2767 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2768 if (Info.isUnitStride() &&
2769 isLegalAddressingMode(AccessTy,
2770 /* BaseGV */ nullptr,
2771 /* BaseOffset */ Stride * I,
2772 /* HasBaseReg */ true,
2773 /* Scale */ 0,
2774 GEP->getType()->getPointerAddressSpace()))
2775 continue;
2776 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2777 {TTI::OK_AnyValue, TTI::OP_None},
2778 {TTI::OK_AnyValue, TTI::OP_None}, {});
2779 } else {
2780 SmallVector<const Value *> Indices(GEP->indices());
2781 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2782 Indices, AccessTy, CostKind);
2783 }
2784 }
2785 return Cost;
2786}
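// Illustrative example (assumed access type): for a same-base, unit-stride
// chain of i32 accesses (p, p+4, p+8, ...), each non-base GEP with constant
// indices, or whose offset folds into the reg+imm addressing mode checked
// above, contributes no extra cost; otherwise it is costed as a scalar Add.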
2787
2788 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2789 TTI::UnrollingPreferences &UP,
2790 OptimizationRemarkEmitter *ORE) const {
2791 // TODO: More tuning on benchmarks and metrics with changes as needed
2792 // would apply to all settings below to enable performance.
2793
2794
2795 if (ST->enableDefaultUnroll())
2796 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2797
2798 // Enable Upper bound unrolling universally, not dependent upon the conditions
2799 // below.
2800 UP.UpperBound = true;
2801
2802 // Disable loop unrolling for Oz and Os.
2803 UP.OptSizeThreshold = 0;
2804 UP.PartialOptSizeThreshold = 0;
2805 if (L->getHeader()->getParent()->hasOptSize())
2806 return;
2807
2808 SmallVector<BasicBlock *, 4> ExitingBlocks;
2809 L->getExitingBlocks(ExitingBlocks);
2810 LLVM_DEBUG(dbgs() << "Loop has:\n"
2811 << "Blocks: " << L->getNumBlocks() << "\n"
2812 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2813
2814 // Only allow one exit other than the latch. This acts as an early exit
2815 // check, mirroring the profitability calculation of the runtime unroller.
2816 if (ExitingBlocks.size() > 2)
2817 return;
2818
2819 // Limit the CFG of the loop body for targets with a branch predictor.
2820 // Allowing 4 blocks permits if-then-else diamonds in the body.
2821 if (L->getNumBlocks() > 4)
2822 return;
2823
2824 // Scan the loop: don't unroll loops with calls as this could prevent
2825 // inlining. Don't unroll auto-vectorized loops either, though do allow
2826 // unrolling of the scalar remainder.
2827 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2828 InstructionCost Cost = 0;
2829 for (auto *BB : L->getBlocks()) {
2830 for (auto &I : *BB) {
2831 // Both auto-vectorized loops and the scalar remainder have the
2832 // isvectorized attribute, so differentiate between them by the presence
2833 // of vector instructions.
2834 if (IsVectorized && (I.getType()->isVectorTy() ||
2835 llvm::any_of(I.operand_values(), [](Value *V) {
2836 return V->getType()->isVectorTy();
2837 })))
2838 return;
2839
2840 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2841 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2842 if (!isLoweredToCall(F))
2843 continue;
2844 }
2845 return;
2846 }
2847
2848 SmallVector<const Value *> Operands(I.operand_values());
2849 Cost += getInstructionCost(&I, Operands,
2850 TargetTransformInfo::TCK_SizeAndLatency);
2851 }
2852 }
2853
2854 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2855
2856 UP.Partial = true;
2857 UP.Runtime = true;
2858 UP.UnrollRemainder = true;
2859 UP.UnrollAndJam = true;
2860
2861 // Forcing unrolling of small loops can be very useful because of the
2862 // branch-taken cost of the backedge.
2863 if (Cost < 12)
2864 UP.Force = true;
2865}
2866
2871
2872 bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2873 MemIntrinsicInfo &Info) const {
2874 const DataLayout &DL = getDataLayout();
2875 Intrinsic::ID IID = Inst->getIntrinsicID();
2876 LLVMContext &C = Inst->getContext();
2877 bool HasMask = false;
2878
2879 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2880 bool IsWrite) -> int64_t {
2881 if (auto *TarExtTy =
2882 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2883 return TarExtTy->getIntParameter(0);
2884
2885 return 1;
2886 };
2887
2888 switch (IID) {
2889 case Intrinsic::riscv_vle_mask:
2890 case Intrinsic::riscv_vse_mask:
2891 case Intrinsic::riscv_vlseg2_mask:
2892 case Intrinsic::riscv_vlseg3_mask:
2893 case Intrinsic::riscv_vlseg4_mask:
2894 case Intrinsic::riscv_vlseg5_mask:
2895 case Intrinsic::riscv_vlseg6_mask:
2896 case Intrinsic::riscv_vlseg7_mask:
2897 case Intrinsic::riscv_vlseg8_mask:
2898 case Intrinsic::riscv_vsseg2_mask:
2899 case Intrinsic::riscv_vsseg3_mask:
2900 case Intrinsic::riscv_vsseg4_mask:
2901 case Intrinsic::riscv_vsseg5_mask:
2902 case Intrinsic::riscv_vsseg6_mask:
2903 case Intrinsic::riscv_vsseg7_mask:
2904 case Intrinsic::riscv_vsseg8_mask:
2905 HasMask = true;
2906 [[fallthrough]];
2907 case Intrinsic::riscv_vle:
2908 case Intrinsic::riscv_vse:
2909 case Intrinsic::riscv_vlseg2:
2910 case Intrinsic::riscv_vlseg3:
2911 case Intrinsic::riscv_vlseg4:
2912 case Intrinsic::riscv_vlseg5:
2913 case Intrinsic::riscv_vlseg6:
2914 case Intrinsic::riscv_vlseg7:
2915 case Intrinsic::riscv_vlseg8:
2916 case Intrinsic::riscv_vsseg2:
2917 case Intrinsic::riscv_vsseg3:
2918 case Intrinsic::riscv_vsseg4:
2919 case Intrinsic::riscv_vsseg5:
2920 case Intrinsic::riscv_vsseg6:
2921 case Intrinsic::riscv_vsseg7:
2922 case Intrinsic::riscv_vsseg8: {
2923 // Intrinsic interface:
2924 // riscv_vle(merge, ptr, vl)
2925 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2926 // riscv_vse(val, ptr, vl)
2927 // riscv_vse_mask(val, ptr, mask, vl, policy)
2928 // riscv_vlseg#(merge, ptr, vl, sew)
2929 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
2930 // riscv_vsseg#(val, ptr, vl, sew)
2931 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
2932 bool IsWrite = Inst->getType()->isVoidTy();
2933 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2934 // The results of segment loads are TargetExtType.
2935 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2936 unsigned SEW =
2937 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2938 ->getZExtValue();
2939 Ty = TarExtTy->getTypeParameter(0U);
2940 Ty = ScalableVectorType::get(
2941 IntegerType::get(C, SEW),
2942 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2943 }
2944 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2945 unsigned VLIndex = RVVIInfo->VLOperand;
2946 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2947 MaybeAlign Alignment =
2948 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2949 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2950 Value *Mask = ConstantInt::getTrue(MaskType);
2951 if (HasMask)
2952 Mask = Inst->getArgOperand(VLIndex - 1);
2953 Value *EVL = Inst->getArgOperand(VLIndex);
2954 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2955 // RVV uses contiguous elements as a segment.
2956 if (SegNum > 1) {
2957 unsigned ElemSize = Ty->getScalarSizeInBits();
2958 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2959 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2960 }
2961 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2962 Alignment, Mask, EVL);
2963 return true;
2964 }
2965 case Intrinsic::riscv_vlse_mask:
2966 case Intrinsic::riscv_vsse_mask:
2967 case Intrinsic::riscv_vlsseg2_mask:
2968 case Intrinsic::riscv_vlsseg3_mask:
2969 case Intrinsic::riscv_vlsseg4_mask:
2970 case Intrinsic::riscv_vlsseg5_mask:
2971 case Intrinsic::riscv_vlsseg6_mask:
2972 case Intrinsic::riscv_vlsseg7_mask:
2973 case Intrinsic::riscv_vlsseg8_mask:
2974 case Intrinsic::riscv_vssseg2_mask:
2975 case Intrinsic::riscv_vssseg3_mask:
2976 case Intrinsic::riscv_vssseg4_mask:
2977 case Intrinsic::riscv_vssseg5_mask:
2978 case Intrinsic::riscv_vssseg6_mask:
2979 case Intrinsic::riscv_vssseg7_mask:
2980 case Intrinsic::riscv_vssseg8_mask:
2981 HasMask = true;
2982 [[fallthrough]];
2983 case Intrinsic::riscv_vlse:
2984 case Intrinsic::riscv_vsse:
2985 case Intrinsic::riscv_vlsseg2:
2986 case Intrinsic::riscv_vlsseg3:
2987 case Intrinsic::riscv_vlsseg4:
2988 case Intrinsic::riscv_vlsseg5:
2989 case Intrinsic::riscv_vlsseg6:
2990 case Intrinsic::riscv_vlsseg7:
2991 case Intrinsic::riscv_vlsseg8:
2992 case Intrinsic::riscv_vssseg2:
2993 case Intrinsic::riscv_vssseg3:
2994 case Intrinsic::riscv_vssseg4:
2995 case Intrinsic::riscv_vssseg5:
2996 case Intrinsic::riscv_vssseg6:
2997 case Intrinsic::riscv_vssseg7:
2998 case Intrinsic::riscv_vssseg8: {
2999 // Intrinsic interface:
3000 // riscv_vlse(merge, ptr, stride, vl)
3001 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
3002 // riscv_vsse(val, ptr, stride, vl)
3003 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
3004 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
3005 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
3006 // riscv_vssseg#(val, ptr, offset, vl, sew)
3007 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
3008 bool IsWrite = Inst->getType()->isVoidTy();
3009 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3010 // The results of segment loads are TargetExtType.
3011 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3012 unsigned SEW =
3013 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3014 ->getZExtValue();
3015 Ty = TarExtTy->getTypeParameter(0U);
3016 Ty = ScalableVectorType::get(
3017 IntegerType::get(C, SEW),
3018 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3019 }
3020 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3021 unsigned VLIndex = RVVIInfo->VLOperand;
3022 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3023 MaybeAlign Alignment =
3024 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3025
3026 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
3027 // Use the pointer alignment as the element alignment if the stride is a
3028 // multiple of the pointer alignment. Otherwise, the element alignment
3029 // should be the greatest common divisor of pointer alignment and stride.
3030 // For simplicity, just treat the elements as unaligned in that case.
3031 unsigned PointerAlign = Alignment.valueOrOne().value();
3032 if (!isa<ConstantInt>(Stride) ||
3033 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
3034 Alignment = Align(1);
3035
3036 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3037 Value *Mask = ConstantInt::getTrue(MaskType);
3038 if (HasMask)
3039 Mask = Inst->getArgOperand(VLIndex - 1);
3040 Value *EVL = Inst->getArgOperand(VLIndex);
3041 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3042 // RVV uses contiguous elements as a segment.
3043 if (SegNum > 1) {
3044 unsigned ElemSize = Ty->getScalarSizeInBits();
3045 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3046 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3047 }
3048 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3049 Alignment, Mask, EVL, Stride);
3050 return true;
3051 }
3052 case Intrinsic::riscv_vloxei_mask:
3053 case Intrinsic::riscv_vluxei_mask:
3054 case Intrinsic::riscv_vsoxei_mask:
3055 case Intrinsic::riscv_vsuxei_mask:
3056 case Intrinsic::riscv_vloxseg2_mask:
3057 case Intrinsic::riscv_vloxseg3_mask:
3058 case Intrinsic::riscv_vloxseg4_mask:
3059 case Intrinsic::riscv_vloxseg5_mask:
3060 case Intrinsic::riscv_vloxseg6_mask:
3061 case Intrinsic::riscv_vloxseg7_mask:
3062 case Intrinsic::riscv_vloxseg8_mask:
3063 case Intrinsic::riscv_vluxseg2_mask:
3064 case Intrinsic::riscv_vluxseg3_mask:
3065 case Intrinsic::riscv_vluxseg4_mask:
3066 case Intrinsic::riscv_vluxseg5_mask:
3067 case Intrinsic::riscv_vluxseg6_mask:
3068 case Intrinsic::riscv_vluxseg7_mask:
3069 case Intrinsic::riscv_vluxseg8_mask:
3070 case Intrinsic::riscv_vsoxseg2_mask:
3071 case Intrinsic::riscv_vsoxseg3_mask:
3072 case Intrinsic::riscv_vsoxseg4_mask:
3073 case Intrinsic::riscv_vsoxseg5_mask:
3074 case Intrinsic::riscv_vsoxseg6_mask:
3075 case Intrinsic::riscv_vsoxseg7_mask:
3076 case Intrinsic::riscv_vsoxseg8_mask:
3077 case Intrinsic::riscv_vsuxseg2_mask:
3078 case Intrinsic::riscv_vsuxseg3_mask:
3079 case Intrinsic::riscv_vsuxseg4_mask:
3080 case Intrinsic::riscv_vsuxseg5_mask:
3081 case Intrinsic::riscv_vsuxseg6_mask:
3082 case Intrinsic::riscv_vsuxseg7_mask:
3083 case Intrinsic::riscv_vsuxseg8_mask:
3084 HasMask = true;
3085 [[fallthrough]];
3086 case Intrinsic::riscv_vloxei:
3087 case Intrinsic::riscv_vluxei:
3088 case Intrinsic::riscv_vsoxei:
3089 case Intrinsic::riscv_vsuxei:
3090 case Intrinsic::riscv_vloxseg2:
3091 case Intrinsic::riscv_vloxseg3:
3092 case Intrinsic::riscv_vloxseg4:
3093 case Intrinsic::riscv_vloxseg5:
3094 case Intrinsic::riscv_vloxseg6:
3095 case Intrinsic::riscv_vloxseg7:
3096 case Intrinsic::riscv_vloxseg8:
3097 case Intrinsic::riscv_vluxseg2:
3098 case Intrinsic::riscv_vluxseg3:
3099 case Intrinsic::riscv_vluxseg4:
3100 case Intrinsic::riscv_vluxseg5:
3101 case Intrinsic::riscv_vluxseg6:
3102 case Intrinsic::riscv_vluxseg7:
3103 case Intrinsic::riscv_vluxseg8:
3104 case Intrinsic::riscv_vsoxseg2:
3105 case Intrinsic::riscv_vsoxseg3:
3106 case Intrinsic::riscv_vsoxseg4:
3107 case Intrinsic::riscv_vsoxseg5:
3108 case Intrinsic::riscv_vsoxseg6:
3109 case Intrinsic::riscv_vsoxseg7:
3110 case Intrinsic::riscv_vsoxseg8:
3111 case Intrinsic::riscv_vsuxseg2:
3112 case Intrinsic::riscv_vsuxseg3:
3113 case Intrinsic::riscv_vsuxseg4:
3114 case Intrinsic::riscv_vsuxseg5:
3115 case Intrinsic::riscv_vsuxseg6:
3116 case Intrinsic::riscv_vsuxseg7:
3117 case Intrinsic::riscv_vsuxseg8: {
3118 // Intrinsic interface (only listed ordered version):
3119 // riscv_vloxei(merge, ptr, index, vl)
3120 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3121 // riscv_vsoxei(val, ptr, index, vl)
3122 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3123 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3124 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3125 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3126 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3127 bool IsWrite = Inst->getType()->isVoidTy();
3128 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3129 // The results of segment loads are TargetExtType.
3130 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3131 unsigned SEW =
3132 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3133 ->getZExtValue();
3134 Ty = TarExtTy->getTypeParameter(0U);
3135 Ty = ScalableVectorType::get(
3136 IntegerType::get(C, SEW),
3137 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3138 }
3139 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3140 unsigned VLIndex = RVVIInfo->VLOperand;
3141 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3142 Value *Mask;
3143 if (HasMask) {
3144 Mask = Inst->getArgOperand(VLIndex - 1);
3145 } else {
3146 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3147 // and casting that to scalar i64 triggers a vector/scalar mismatch
3148 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3149 // via extractelement instead.
3150 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3151 Mask = ConstantInt::getTrue(MaskType);
3152 }
3153 Value *EVL = Inst->getArgOperand(VLIndex);
3154 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3155 // RVV uses contiguous elements as a segment.
3156 if (SegNum > 1) {
3157 unsigned ElemSize = Ty->getScalarSizeInBits();
3158 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3159 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3160 }
3161 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3162 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3163 Align(1), Mask, EVL,
3164 /* Stride */ nullptr, OffsetOp);
3165 return true;
3166 }
3167 }
3168 return false;
3169}
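// Illustrative walk-through (not part of the upstream source): for a call like
//   riscv_vloxseg2_mask(merge, ptr, index, mask, vl, policy, sew)
// the VL operand index from the intrinsic table points at 'vl', so the pointer is
// found at VLIndex - 2 - HasMask, the index vector is the operand right after the
// pointer (recorded as OffsetOp), and the mask sits at VLIndex - 1. With two i32
// segments the recorded access type is widened to i64 lanes, so the instrumentation
// sees one contiguous 64-bit access per segment group.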
3170
3171unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
3172 if (Ty->isVectorTy()) {
3173 // f16 (with only Zvfhmin) and bf16 will be promoted to f32.
3174 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3175 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3176 EltTy->isBFloatTy())
3177 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3178 cast<VectorType>(Ty));
3179
3180 TypeSize Size = DL.getTypeSizeInBits(Ty);
3181 if (Size.isScalable() && ST->hasVInstructions())
3182 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3183
3184 if (ST->useRVVForFixedLengthVectors())
3185 return divideCeil(Size, ST->getRealMinVLen());
3186 }
3187
3188 return BaseT::getRegUsageForType(Ty);
3189}
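// Worked example (illustrative): <vscale x 4 x i32> has a known-minimum size of
// 128 bits, so with RVVBitsPerBlock = 64 this reports divideCeil(128, 64) = 2
// vector registers (an LMUL=2 value). A fixed <8 x i32> on a core whose real
// minimum VLEN is 128 likewise reports divideCeil(256, 128) = 2 registers when
// RVV is used for fixed-length vectors.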
3190
3191unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3192 if (SLPMaxVF.getNumOccurrences())
3193 return SLPMaxVF;
3194
3195 // Return how many elements can fit in getRegisterBitWidth. This is the
3196 // same routine as used in the LoopVectorizer. We should probably be
3197 // accounting for whether we actually have instructions with the right
3198 // lane type, but we don't have enough information to do that without
3199 // some additional plumbing which hasn't been justified yet.
3200 TypeSize RegWidth =
3201 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3202 // If no vector registers, or absurd element widths, disable
3203 // vectorization by returning 1.
3204 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3205}
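// Worked example (illustrative, assuming VLEN = 128 and riscv-v-register-bit-width-lmul
// at its default of 2): the fixed-width register query yields 256 bits, so 32-bit
// elements give SLP a maximum VF of 256 / 32 = 8; the std::max keeps the result at 1
// for element widths larger than the register width.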
3206
3210
3212 return ST->enableUnalignedVectorMem();
3213}
3214
3215TTI::AddressingModeKind
3216RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3217 ScalarEvolution *SE) const {
3218 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3219 return TTI::AMK_PostIndexed;
3220
3221 return BaseT::getPreferredAddressingMode(L, SE);
3222}
3223
3224bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
3225 const TargetTransformInfo::LSRCost &C2) const {
3226 // The RISC-V-specific part here is that instruction count gets first priority.
3227 // If we need to emit adds inside the loop to add up base registers, then
3228 // we need at least one extra temporary register.
3229 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3230 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3231 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3232 C1.NumIVMuls, C1.NumBaseAdds,
3233 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3234 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3235 C2.NumIVMuls, C2.NumBaseAdds,
3236 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3237}
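// Illustrative comparison (not from the source): a formula with Insns = 4 and
// NumRegs = 3 beats one with Insns = 5 and NumRegs = 2 because instruction count is
// the leading element of the lexicographic tuple; a formula that needs base-register
// adds inside the loop is additionally charged one extra register for the temporary.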
3238
3239bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
3240 Align Alignment) const {
3241 auto *VTy = dyn_cast<VectorType>(DataTy);
3242 if (!VTy || VTy->isScalableTy())
3243 return false;
3244
3245 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3246 return false;
3247
3248 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3249 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3250 if (VTy->getElementType()->isIntegerTy(8))
3251 if (VTy->getElementCount().getFixedValue() > 256)
3252 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3253 ST->getMaxLMULForFixedLengthVectors();
3254 return true;
3255}
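// Worked example for the i8 guard above (illustrative, assuming a real minimum VLEN
// of 128 and a maximum fixed-length LMUL of 8): <512 x i8> is 4096 bits, and
// 4096 / 128 = 32 is not less than 8, so the type is rejected; <256 x i8> never
// reaches the guard because its element count does not exceed 256.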
3256
3257bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy,
3258 Align Alignment) const {
3259 auto *VTy = dyn_cast<VectorType>(DataTy);
3260 if (!VTy || VTy->isScalableTy())
3261 return false;
3262
3263 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3264 return false;
3265 return true;
3266}
3267
3268/// See if \p I should be considered for address type promotion. We check if \p
3269/// I is a sext with the right type that is used in memory accesses. If it is used
3270/// in a "complex" getelementptr, we allow it to be promoted without finding other
3271/// sext instructions that sign extended the same initial value. A getelementptr
3272/// is considered "complex" if it has more than 2 operands.
3273bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3274 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3275 bool Considerable = false;
3276 AllowPromotionWithoutCommonHeader = false;
3277 if (!isa<SExtInst>(&I))
3278 return false;
3279 Type *ConsideredSExtType =
3280 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3281 if (I.getType() != ConsideredSExtType)
3282 return false;
3283 // See if the sext is the one with the right type and used in at least one
3284 // GetElementPtrInst.
3285 for (const User *U : I.users()) {
3286 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3287 Considerable = true;
3288 // A getelementptr is considered "complex" if it has more than 2
3289 // operands. We will promote a SExt used in such a complex GEP as we
3290 // expect some computation to be merged when it is done on 64 bits.
3291 if (GEPInst->getNumOperands() > 2) {
3292 AllowPromotionWithoutCommonHeader = true;
3293 break;
3294 }
3295 }
3296 }
3297 return Considerable;
3298}
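// Illustrative IR (not from the source): the sext below may be promoted without a
// common header because it feeds a GEP with more than two operands, where the
// extension is expected to fold into the 64-bit address arithmetic:
//   %e = sext i32 %i to i64
//   %p = getelementptr inbounds [16 x i32], ptr %a, i64 %e, i64 %j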
3299
3300bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3301 switch (Opcode) {
3302 case Instruction::Add:
3303 case Instruction::Sub:
3304 case Instruction::Mul:
3305 case Instruction::And:
3306 case Instruction::Or:
3307 case Instruction::Xor:
3308 case Instruction::FAdd:
3309 case Instruction::FSub:
3310 case Instruction::FMul:
3311 case Instruction::FDiv:
3312 case Instruction::ICmp:
3313 case Instruction::FCmp:
3314 return true;
3315 case Instruction::Shl:
3316 case Instruction::LShr:
3317 case Instruction::AShr:
3318 case Instruction::UDiv:
3319 case Instruction::SDiv:
3320 case Instruction::URem:
3321 case Instruction::SRem:
3322 case Instruction::Select:
3323 return Operand == 1;
3324 default:
3325 return false;
3326 }
3327}
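// Illustrative rationale (not from the source): add/sub/mul, the bitwise ops, the FP
// binary ops and the compares accept a scalar on either side, either via reversed
// instructions (vrsub.vx, vfrsub.vf, vfrdiv.vf) or by swapping the compare predicate,
// whereas shifts, integer division/remainder and select only have a scalar form for
// the second operand, hence the Operand == 1 restriction.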
3328
3329bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3330 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3331 return false;
3332
3333 if (canSplatOperand(I->getOpcode(), Operand))
3334 return true;
3335
3336 auto *II = dyn_cast<IntrinsicInst>(I);
3337 if (!II)
3338 return false;
3339
3340 switch (II->getIntrinsicID()) {
3341 case Intrinsic::fma:
3342 case Intrinsic::vp_fma:
3343 case Intrinsic::fmuladd:
3344 case Intrinsic::vp_fmuladd:
3345 return Operand == 0 || Operand == 1;
3346 case Intrinsic::vp_shl:
3347 case Intrinsic::vp_lshr:
3348 case Intrinsic::vp_ashr:
3349 case Intrinsic::vp_udiv:
3350 case Intrinsic::vp_sdiv:
3351 case Intrinsic::vp_urem:
3352 case Intrinsic::vp_srem:
3353 case Intrinsic::ssub_sat:
3354 case Intrinsic::vp_ssub_sat:
3355 case Intrinsic::usub_sat:
3356 case Intrinsic::vp_usub_sat:
3357 case Intrinsic::vp_select:
3358 return Operand == 1;
3359 // These intrinsics are commutative.
3360 case Intrinsic::vp_add:
3361 case Intrinsic::vp_mul:
3362 case Intrinsic::vp_and:
3363 case Intrinsic::vp_or:
3364 case Intrinsic::vp_xor:
3365 case Intrinsic::vp_fadd:
3366 case Intrinsic::vp_fmul:
3367 case Intrinsic::vp_icmp:
3368 case Intrinsic::vp_fcmp:
3369 case Intrinsic::smin:
3370 case Intrinsic::vp_smin:
3371 case Intrinsic::umin:
3372 case Intrinsic::vp_umin:
3373 case Intrinsic::smax:
3374 case Intrinsic::vp_smax:
3375 case Intrinsic::umax:
3376 case Intrinsic::vp_umax:
3377 case Intrinsic::sadd_sat:
3378 case Intrinsic::vp_sadd_sat:
3379 case Intrinsic::uadd_sat:
3380 case Intrinsic::vp_uadd_sat:
3381 // These intrinsics have 'vr' versions.
3382 case Intrinsic::vp_sub:
3383 case Intrinsic::vp_fsub:
3384 case Intrinsic::vp_fdiv:
3385 return Operand == 0 || Operand == 1;
3386 default:
3387 return false;
3388 }
3389}
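// Illustrative note (not from the source): vp.sub, vp.fsub and vp.fdiv report both
// operands as splattable because the reversed scalar forms (vrsub.vx, vfrsub.vf and
// vfrdiv.vf) cover the case where the scalar appears as the first operand.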
3390
3391/// Check if sinking \p I's operands to I's basic block is profitable, because
3392/// the operands can be folded into a target instruction, e.g.
3393/// splats of scalars can fold into vector instructions.
3394bool RISCVTTIImpl::isProfitableToSinkOperands(
3395 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3396 using namespace llvm::PatternMatch;
3397
3398 if (I->isBitwiseLogicOp()) {
3399 if (!I->getType()->isVectorTy()) {
3400 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3401 for (auto &Op : I->operands()) {
3402 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3403 if (match(Op.get(), m_Not(m_Value()))) {
3404 Ops.push_back(&Op);
3405 return true;
3406 }
3407 }
3408 }
3409 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3410 for (auto &Op : I->operands()) {
3411 // (and X, (not Y)) -> (vandn.vv X, Y)
3412 if (match(Op.get(), m_Not(m_Value()))) {
3413 Ops.push_back(&Op);
3414 return true;
3415 }
3416 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3417 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()),
3418 m_ZeroInt()),
3419 m_Value(), m_ZeroMask()))) {
3420 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3421 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3422 Ops.push_back(&Not);
3423 Ops.push_back(&InsertElt);
3424 Ops.push_back(&Op);
3425 return true;
3426 }
3427 }
3428 }
3429 }
3430
3431 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3432 return false;
3433
3434 // Don't sink splat operands unless the target prefers it. Some targets require
3435 // S2V transfer buffers, and we can run out of them copying the same value
3436 // repeatedly.
3437 // FIXME: It could still be worth doing if it would improve vector register
3438 // pressure and prevent a vector spill.
3439 if (!ST->sinkSplatOperands())
3440 return false;
3441
3442 for (auto OpIdx : enumerate(I->operands())) {
3443 if (!canSplatOperand(I, OpIdx.index()))
3444 continue;
3445
3446 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3447 // Make sure we are not already sinking this operand
3448 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3449 continue;
3450
3451 // We are looking for a splat that can be sunk.
3452 if (!match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
3453 m_Value(), m_ZeroMask())))
3454 continue;
3455
3456 // Don't sink i1 splats.
3457 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3458 continue;
3459
3460 // All uses of the shuffle should be sunk to avoid duplicating it across GPR
3461 // and vector registers.
3462 for (Use &U : Op->uses()) {
3463 Instruction *Insn = cast<Instruction>(U.getUser());
3464 if (!canSplatOperand(Insn, U.getOperandNo()))
3465 return false;
3466 }
3467
3468 // Sink any fpexts since they might be used in a widening fp pattern.
3469 Use *InsertEltUse = &Op->getOperandUse(0);
3470 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3471 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3472 Ops.push_back(&InsertElt->getOperandUse(1));
3473 Ops.push_back(InsertEltUse);
3474 Ops.push_back(&OpIdx.value());
3475 }
3476 return true;
3477}
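// Illustrative IR (not from the source): with
//   %ins = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
//   %splat = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> poison,
//                          <vscale x 4 x i32> zeroinitializer
//   %add = add <vscale x 4 x i32> %v, %splat
// both the insertelement and the shufflevector are pushed into Ops so that they are
// sunk next to the add, letting instruction selection fold the operand into vadd.vx
// instead of keeping the splat live in a vector register.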
3478
3479TTI::MemCmpExpansionOptions
3480RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3481 TTI::MemCmpExpansionOptions Options;
3482 // TODO: Enable expansion when unaligned access is not supported after we fix
3483 // issues in ExpandMemcmp.
3484 if (!ST->enableUnalignedScalarMem())
3485 return Options;
3486
3487 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3488 return Options;
3489
3490 Options.AllowOverlappingLoads = true;
3491 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3492 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3493 if (ST->is64Bit()) {
3494 Options.LoadSizes = {8, 4, 2, 1};
3495 Options.AllowedTailExpansions = {3, 5, 6};
3496 } else {
3497 Options.LoadSizes = {4, 2, 1};
3498 Options.AllowedTailExpansions = {3};
3499 }
3500
3501 if (IsZeroCmp && ST->hasVInstructions()) {
3502 unsigned VLenB = ST->getRealMinVLen() / 8;
3503 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3504 // `VLenB * MaxLMUL` so that it fits in a single register group.
3505 unsigned MinSize = ST->getXLen() / 8 + 1;
3506 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3507 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3508 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3509 }
3510 return Options;
3511}
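// Worked example (illustrative, assuming RV64 with Zbb and fast unaligned scalar
// access): LoadSizes is {8, 4, 2, 1} and AllowedTailExpansions is {3, 5, 6}, so a
// 13-byte memcmp can expand as an 8-byte block plus a 5-byte tail instead of a
// libcall. For a compare against zero with VLEN = 128, whole-vector sizes from
// XLen/8 + 1 = 9 bytes up to 16 * MaxLMUL bytes are also accepted.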
3512
3513bool RISCVTTIImpl::shouldTreatInstructionLikeSelect(
3514 const Instruction *I) const {
3515 if (EnableOrLikeSelectOpt) {
3516 // For the binary operators (e.g. or) we need to be more careful than with
3517 // selects; here we only transform them if they are already at a natural
3518 // break point in the code - the end of a block with an unconditional
3519 // terminator.
3520 if (I->getOpcode() == Instruction::Or &&
3521 isa<BranchInst>(I->getNextNode()) &&
3522 cast<BranchInst>(I->getNextNode())->isUnconditional())
3523 return true;
3524
3525 if (I->getOpcode() == Instruction::Add ||
3526 I->getOpcode() == Instruction::Sub)
3527 return true;
3528 }
3529 return BaseT::shouldTreatInstructionLikeSelect(I);
3530}