RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
24 "riscv-v-register-bit-width-lmul",
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
29
31 "riscv-v-slp-max-vf",
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
36
38RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
40 size_t NumInstr = OpCodes.size();
42 return NumInstr;
43 InstructionCost LMULCost = TLI->getLMULCost(VT);
45 return LMULCost * NumInstr;
47 for (auto Op : OpCodes) {
48 switch (Op) {
49 case RISCV::VRGATHER_VI:
50 Cost += TLI->getVRGatherVICost(VT);
51 break;
52 case RISCV::VRGATHER_VV:
53 Cost += TLI->getVRGatherVVCost(VT);
54 break;
55 case RISCV::VSLIDEUP_VI:
56 case RISCV::VSLIDEDOWN_VI:
57 Cost += TLI->getVSlideVICost(VT);
58 break;
59 case RISCV::VSLIDEUP_VX:
60 case RISCV::VSLIDEDOWN_VX:
61 Cost += TLI->getVSlideVXCost(VT);
62 break;
63 case RISCV::VREDMAX_VS:
64 case RISCV::VREDMIN_VS:
65 case RISCV::VREDMAXU_VS:
66 case RISCV::VREDMINU_VS:
67 case RISCV::VREDSUM_VS:
68 case RISCV::VREDAND_VS:
69 case RISCV::VREDOR_VS:
70 case RISCV::VREDXOR_VS:
71 case RISCV::VFREDMAX_VS:
72 case RISCV::VFREDMIN_VS:
73 case RISCV::VFREDUSUM_VS: {
74 unsigned VL = VT.getVectorMinNumElements();
75 if (!VT.isFixedLengthVector())
76 VL *= *getVScaleForTuning();
77 Cost += Log2_32_Ceil(VL);
78 break;
79 }
80 case RISCV::VFREDOSUM_VS: {
81 unsigned VL = VT.getVectorMinNumElements();
82 if (!VT.isFixedLengthVector())
83 VL *= *getVScaleForTuning();
84 Cost += VL;
85 break;
86 }
87 case RISCV::VMV_X_S:
88 case RISCV::VMV_S_X:
89 case RISCV::VFMV_F_S:
90 case RISCV::VFMV_S_F:
91 case RISCV::VMNAND_MM:
92 case RISCV::VCPOP_M:
93 Cost += 1;
94 break;
95 default:
96 Cost += LMULCost;
97 }
98 }
99 return Cost;
100}
101
104 assert(Ty->isIntegerTy() &&
105 "getIntImmCost can only estimate cost of materialising integers");
106
107 // We have a Zero register, so 0 is always free.
108 if (Imm == 0)
109 return TTI::TCC_Free;
110
111 // Otherwise, we check how many instructions it will take to materialise.
112 const DataLayout &DL = getDataLayout();
113 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
114}
115
116// Look for patterns of shift followed by AND that can be turned into a pair of
117// shifts. We won't need to materialize an immediate for the AND so these can
118// be considered free.
119static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
120 uint64_t Mask = Imm.getZExtValue();
121 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
122 if (!BO || !BO->hasOneUse())
123 return false;
124
125 if (BO->getOpcode() != Instruction::Shl)
126 return false;
127
128 if (!isa<ConstantInt>(BO->getOperand(1)))
129 return false;
130
131 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
132 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
133 // is a mask shifted by c2 bits with c3 leading zeros.
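  // Illustrative example (not taken from the source): with ShAmt == 11 and
  // Mask == 0xFFFF800 (a 17-bit mask shifted left by 11, so 36 leading zeros),
  // countr_zero(Mask) == 11 == ShAmt, and (and (shl x, 11), 0xFFFF800) can
  // lower to (srli (slli x, 47), 36) with no materialized AND immediate.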
134 if (isShiftedMask_64(Mask)) {
135 unsigned Trailing = llvm::countr_zero(Mask);
136 if (ShAmt == Trailing)
137 return true;
138 }
139
140 return false;
141}
142
144 const APInt &Imm, Type *Ty,
146 Instruction *Inst) {
147 assert(Ty->isIntegerTy() &&
148 "getIntImmCost can only estimate cost of materialising integers");
149
150 // We have a Zero register, so 0 is always free.
151 if (Imm == 0)
152 return TTI::TCC_Free;
153
154 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
155 // commutative, in others the immediate comes from a specific argument index.
156 bool Takes12BitImm = false;
157 unsigned ImmArgIdx = ~0U;
158
159 switch (Opcode) {
160 case Instruction::GetElementPtr:
161 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
162 // split up large offsets in GEP into better parts than ConstantHoisting
163 // can.
164 return TTI::TCC_Free;
165 case Instruction::And:
166 // zext.h
167 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
168 return TTI::TCC_Free;
169 // zext.w
170 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
171 return TTI::TCC_Free;
172 // bclri
173 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
174 return TTI::TCC_Free;
175 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
176 canUseShiftPair(Inst, Imm))
177 return TTI::TCC_Free;
178 Takes12BitImm = true;
179 break;
180 case Instruction::Add:
181 Takes12BitImm = true;
182 break;
183 case Instruction::Or:
184 case Instruction::Xor:
185 // bseti/binvi
186 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
187 return TTI::TCC_Free;
188 Takes12BitImm = true;
189 break;
190 case Instruction::Mul:
191 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
192 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
193 return TTI::TCC_Free;
194 // One more or less than a power of 2 can use SLLI+ADD/SUB.
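  // Illustrative example: a multiply by 9 satisfies (Imm - 1).isPowerOf2()
  // and can be lowered as
  //   slli t0, a0, 3
  //   add  a0, t0, a0
  // so the immediate is treated as free.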
195 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
196 return TTI::TCC_Free;
197 // FIXME: There is no MULI instruction.
198 Takes12BitImm = true;
199 break;
200 case Instruction::Sub:
201 case Instruction::Shl:
202 case Instruction::LShr:
203 case Instruction::AShr:
204 Takes12BitImm = true;
205 ImmArgIdx = 1;
206 break;
207 default:
208 break;
209 }
210
211 if (Takes12BitImm) {
212 // Check immediate is the correct argument...
213 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
214 // ... and fits into the 12-bit immediate.
215 if (Imm.getSignificantBits() <= 64 &&
216 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
217 return TTI::TCC_Free;
218 }
219 }
220
221 // Otherwise, use the full materialisation cost.
222 return getIntImmCost(Imm, Ty, CostKind);
223 }
224
225 // By default, prevent hoisting.
226 return TTI::TCC_Free;
227}
228
231 const APInt &Imm, Type *Ty,
233 // Prevent hoisting in unknown cases.
234 return TTI::TCC_Free;
235}
236
239 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
240 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
243}
244
246 // Currently, the ExpandReductions pass can't expand scalable-vector
247 // reductions, but we still request expansion as RVV doesn't support certain
248 // reductions and the SelectionDAG can't legalize them either.
249 switch (II->getIntrinsicID()) {
250 default:
251 return false;
252 // These reductions have no equivalent in RVV
253 case Intrinsic::vector_reduce_mul:
254 case Intrinsic::vector_reduce_fmul:
255 return true;
256 }
257}
258
259std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
260 if (ST->hasVInstructions())
262 return BaseT::getMaxVScale();
263}
264
265std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
266 if (ST->hasVInstructions())
267 if (unsigned MinVLen = ST->getRealMinVLen();
268 MinVLen >= RISCV::RVVBitsPerBlock)
269 return MinVLen / RISCV::RVVBitsPerBlock;
271}
272
275 unsigned LMUL =
276 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
277 switch (K) {
279 return TypeSize::getFixed(ST->getXLen());
281 return TypeSize::getFixed(
282 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
285 (ST->hasVInstructions() &&
288 : 0);
289 }
290
291 llvm_unreachable("Unsupported register kind");
292}
293
295RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
296 // Add a cost of address generation + the cost of the load. The address
297 // is expected to be a PC relative offset to a constant pool entry
298 // using auipc/addi.
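  // Illustrative sequence (symbol name made up):
  //   auipc   a0, %pcrel_hi(.LCPI0_0)
  //   addi    a0, a0, %pcrel_lo(...)
  //   vle32.v v8, (a0)
  // The constant 2 covers the auipc/addi pair; the rest is the load itself.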
299 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
300 /*AddressSpace=*/0, CostKind);
301}
302
303static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
304 LLVMContext &C) {
305 assert((DataVT.getScalarSizeInBits() != 8 ||
306 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
307 MVT IndexVT = DataVT.changeTypeToInteger();
308 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
309 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
310 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
311}
312
314 VectorType *Tp, ArrayRef<int> Mask,
316 int Index, VectorType *SubTp,
318 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
319
320 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
321
322 // First, handle cases where having a fixed length vector enables us to
323 // give a more accurate cost than falling back to generic scalable codegen.
324 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
325 if (isa<FixedVectorType>(Tp)) {
326 switch (Kind) {
327 default:
328 break;
330 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
331 MVT EltTp = LT.second.getVectorElementType();
332 // If the size of the element is < ELEN then shuffles of interleaves and
333 // deinterleaves of 2 vectors can be lowered into the following
334 // sequences
335 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
336 // Example sequence:
337 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
338 // vwaddu.vv v10, v8, v9
339 // li a0, -1 (ignored)
340 // vwmaccu.vx v10, a0, v9
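  // For reference (illustrative), an interleave mask over two 4-element
  // sources looks like <0, 4, 1, 5, 2, 6, 3, 7>, and the stride-2
  // deinterleave masks handled below look like <0, 2, 4, 6> or <1, 3, 5, 7>.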
341 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
342 return 2 * LT.first * TLI->getLMULCost(LT.second);
343
344 if (Mask[0] == 0 || Mask[0] == 1) {
345 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
346 // Example sequence:
347 // vnsrl.wi v10, v8, 0
348 if (equal(DeinterleaveMask, Mask))
349 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
350 LT.second, CostKind);
351 }
352 }
353 }
354 // vrgather + cost of generating the mask constant.
355 // We model this for an unknown mask with a single vrgather.
356 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
357 (LT.second.getScalarSizeInBits() != 8 ||
358 LT.second.getVectorNumElements() <= 256)) {
359 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
360 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
361 return IndexCost +
362 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
363 }
364 [[fallthrough]];
365 }
368 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
369 // register for the second vrgather. We model this for an unknown
370 // (shuffle) mask.
371 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
372 (LT.second.getScalarSizeInBits() != 8 ||
373 LT.second.getVectorNumElements() <= 256)) {
374 auto &C = Tp->getContext();
375 auto EC = Tp->getElementCount();
376 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
378 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
379 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
380 return 2 * IndexCost +
381 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
382 LT.second, CostKind) +
383 MaskCost;
384 }
385 [[fallthrough]];
386 }
387 case TTI::SK_Select: {
388 // We are going to permute multiple sources and the result will be in
389 // multiple destinations. We provide an accurate cost only for splits where
390 // the element type remains the same.
391 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
392 LT.second.isFixedLengthVector() &&
393 LT.second.getVectorElementType().getSizeInBits() ==
395 LT.second.getVectorNumElements() <
396 cast<FixedVectorType>(Tp)->getNumElements() &&
397 divideCeil(Mask.size(),
398 cast<FixedVectorType>(Tp)->getNumElements()) ==
399 static_cast<unsigned>(*LT.first.getValue())) {
400 unsigned NumRegs = *LT.first.getValue();
401 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
402 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
403 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
404
406 for (unsigned I = 0; I < NumRegs; ++I) {
407 bool IsSingleVector = true;
408 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
409 transform(Mask.slice(I * SubVF,
410 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
411 SubMask.begin(), [&](int I) {
412 bool SingleSubVector = I / VF == 0;
413 IsSingleVector &= SingleSubVector;
414 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
415 });
418 SubVecTy, SubMask, CostKind, 0, nullptr);
419 return Cost;
420 }
421 }
422 break;
423 }
424 }
425 };
426
427 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
428 switch (Kind) {
429 default:
430 // Fallthrough to generic handling.
431 // TODO: Most of these cases will return getInvalid in generic code, and
432 // must be implemented here.
433 break;
435 // Extract at zero is always a subregister extract
436 if (Index == 0)
437 return TTI::TCC_Free;
438
439 // If we're extracting a subvector of at most m1 size at a sub-register
440 // boundary - which unfortunately we need exact vlen to identify - this is
441 // a subregister extract at worst and thus won't require a vslidedown.
442 // TODO: Extend for aligned m2, m4 subvector extracts
443 // TODO: Extend for misaligned (but contained) extracts
444 // TODO: Extend for scalable subvector types
445 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
446 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
447 const unsigned MinVLen = ST->getRealMinVLen();
448 const unsigned MaxVLen = ST->getRealMaxVLen();
449 if (MinVLen == MaxVLen &&
450 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
451 SubLT.second.getSizeInBits() <= MinVLen)
452 return TTI::TCC_Free;
453 }
454
455 // Example sequence:
456 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
457 // vslidedown.vi v8, v9, 2
458 return LT.first *
459 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
461 // Example sequence:
462 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
463 // vslideup.vi v8, v9, 2
464 return LT.first *
465 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
466 case TTI::SK_Select: {
467 // Example sequence:
468 // li a0, 90
469 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
470 // vmv.s.x v0, a0
471 // vmerge.vvm v8, v9, v8, v0
472 // We use 2 for the cost of the mask materialization as this is the true
473 // cost for small masks and most shuffles are small. At worst, this cost
474 // should be a very small constant for the constant pool load. As such,
475 // we may bias towards large selects slightly more than truly warranted.
476 return LT.first *
477 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
478 LT.second, CostKind));
479 }
480 case TTI::SK_Broadcast: {
481 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
482 Instruction::InsertElement);
483 if (LT.second.getScalarSizeInBits() == 1) {
484 if (HasScalar) {
485 // Example sequence:
486 // andi a0, a0, 1
487 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
488 // vmv.v.x v8, a0
489 // vmsne.vi v0, v8, 0
490 return LT.first *
491 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
492 LT.second, CostKind));
493 }
494 // Example sequence:
495 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
496 // vmv.v.i v8, 0
497 // vmerge.vim v8, v8, 1, v0
498 // vmv.x.s a0, v8
499 // andi a0, a0, 1
500 // vmv.v.x v8, a0
501 // vmsne.vi v0, v8, 0
502
503 return LT.first *
504 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
505 RISCV::VMV_X_S, RISCV::VMV_V_X,
506 RISCV::VMSNE_VI},
507 LT.second, CostKind));
508 }
509
510 if (HasScalar) {
511 // Example sequence:
512 // vmv.v.x v8, a0
513 return LT.first *
514 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
515 }
516
517 // Example sequence:
518 // vrgather.vi v9, v8, 0
519 return LT.first *
520 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
521 }
522 case TTI::SK_Splice: {
523 // vslidedown+vslideup.
524 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
525 // of similar code, but I think we expand through memory.
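    // Illustrative sequence for a splice at offset 2 (an assumption, mirroring
    // how llvm.vector.splice is typically lowered):
    //   vslidedown.vi v8, v8, 2
    //   vslideup.vx   v8, v9, a0   ; a0 = VL - 2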
526 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
527 if (Index >= 0 && Index < 32)
528 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
529 else if (Index < 0 && Index > -32)
530 Opcodes[1] = RISCV::VSLIDEUP_VI;
531 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
532 }
533 case TTI::SK_Reverse: {
534 // TODO: Cases to improve here:
535 // * Illegal vector types
536 // * i64 on RV32
537 // * i1 vector
538 // At low LMUL, most of the cost is producing the vrgather index register.
539 // At high LMUL, the cost of the vrgather itself will dominate.
540 // Example sequence:
541 // csrr a0, vlenb
542 // srli a0, a0, 3
543 // addi a0, a0, -1
544 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
545 // vid.v v9
546 // vrsub.vx v10, v9, a0
547 // vrgather.vv v9, v8, v10
548 InstructionCost LenCost = 3;
549 if (LT.second.isFixedLengthVector())
550 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
551 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
552 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
553 if (LT.second.isFixedLengthVector() &&
554 isInt<5>(LT.second.getVectorNumElements() - 1))
555 Opcodes[1] = RISCV::VRSUB_VI;
556 InstructionCost GatherCost =
557 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
558 // An i1 (mask) vector additionally requires an extend and a truncate.
559 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
560 return LT.first * (LenCost + GatherCost + ExtendCost);
561 }
562 }
563 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
564}
565
567RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
568 unsigned AddressSpace,
570 if (!isLegalMaskedLoadStore(Src, Alignment) ||
572 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
573 CostKind);
574
575 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
576}
577
579 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
580 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
581 bool UseMaskForCond, bool UseMaskForGaps) {
582 if (isa<ScalableVectorType>(VecTy))
584 auto *FVTy = cast<FixedVectorType>(VecTy);
585 InstructionCost MemCost =
586 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
587 unsigned VF = FVTy->getNumElements() / Factor;
588
589 // The interleaved memory access pass will lower interleaved memory ops (i.e.
590 // a load or store combined with a specific shuffle) to vlseg/vsseg
591 // intrinsics. In those cases we can treat it as if it were just one (legal)
592 // memory op
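  // Illustrative example: a Factor=2 interleaved load of a wide <8 x i32>
  // becomes a single vlseg2e32.v, so it is costed below as one legal memory
  // operation rather than a load plus shuffles.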
593 if (!UseMaskForCond && !UseMaskForGaps &&
594 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
595 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
596 // Need to make sure the type hasn't been scalarized
597 if (LT.second.isFixedLengthVector()) {
598 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
599 LT.second.getVectorNumElements());
600 // FIXME: We use the memory op cost of the *legalized* type here, because
601 // its getMemoryOpCost returns a really expensive cost for types like
602 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
603 // Should the memory op cost of these be cheaper?
604 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
605 AddressSpace, DL)) {
606 InstructionCost LegalMemCost = getMemoryOpCost(
607 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
608 return LT.first + LegalMemCost;
609 }
610 }
611 }
612
613 // An interleaved load will look like this for Factor=3:
614 // %wide.vec = load <12 x i32>, ptr %3, align 4
615 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
616 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
617 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
618 if (Opcode == Instruction::Load) {
619 InstructionCost Cost = MemCost;
620 for (unsigned Index : Indices) {
621 FixedVectorType *SubVecTy =
622 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
623 auto Mask = createStrideMask(Index, Factor, VF);
624 InstructionCost ShuffleCost =
626 CostKind, 0, nullptr, {});
627 Cost += ShuffleCost;
628 }
629 return Cost;
630 }
631
632 // TODO: Model for NF > 2
633 // We'll need to enhance getShuffleCost to model shuffles that are just
634 // inserts and extracts into subvectors, since they won't have the full cost
635 // of a vrgather.
636 // An interleaved store for 3 vectors of 4 lanes will look like
637 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
638 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
639 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
640 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
641 // store <12 x i32> %interleaved.vec, ptr %10, align 4
642 if (Factor != 2)
643 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
644 Alignment, AddressSpace, CostKind,
645 UseMaskForCond, UseMaskForGaps);
646
647 assert(Opcode == Instruction::Store && "Opcode must be a store");
648 // For an interleaving store of 2 vectors, we perform one large interleaving
649 // shuffle that goes into the wide store
650 auto Mask = createInterleaveMask(VF, Factor);
651 InstructionCost ShuffleCost =
653 CostKind, 0, nullptr, {});
654 return MemCost + ShuffleCost;
655}
656
658 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
659 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
661 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
662 Alignment, CostKind, I);
663
664 if ((Opcode == Instruction::Load &&
665 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
666 (Opcode == Instruction::Store &&
667 !isLegalMaskedScatter(DataTy, Align(Alignment))))
668 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
669 Alignment, CostKind, I);
670
671 // Cost is proportional to the number of memory operations implied. For
672 // scalable vectors, we use an estimate on that number since we don't
673 // know exactly what VL will be.
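  // Worked example (illustrative): a gather from <vscale x 4 x i32> with a
  // tuning vscale of 2 is treated as roughly 8 independent element accesses,
  // so the result is 8 * the scalar memory op cost.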
674 auto &VTy = *cast<VectorType>(DataTy);
675 InstructionCost MemOpCost =
676 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
677 {TTI::OK_AnyValue, TTI::OP_None}, I);
678 unsigned NumLoads = getEstimatedVLFor(&VTy);
679 return NumLoads * MemOpCost;
680}
681
683 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
684 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
685 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
686 !isLegalStridedLoadStore(DataTy, Alignment)) ||
687 (Opcode != Instruction::Load && Opcode != Instruction::Store))
688 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
689 Alignment, CostKind, I);
690
692 return TTI::TCC_Basic;
693
694 // Cost is proportional to the number of memory operations implied. For
695 // scalable vectors, we use an estimate on that number since we don't
696 // know exactly what VL will be.
697 auto &VTy = *cast<VectorType>(DataTy);
698 InstructionCost MemOpCost =
699 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
700 {TTI::OK_AnyValue, TTI::OP_None}, I);
701 unsigned NumLoads = getEstimatedVLFor(&VTy);
702 return NumLoads * MemOpCost;
703}
704
705// Currently, these represent both throughput and codesize costs
706// for the respective intrinsics. The costs in this table are simply
707// instruction counts with the following adjustments made:
708// * One vsetvli is considered free.
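// For example (illustrative): the {Intrinsic::floor, MVT::f32, 9} entry below
// means a vector floor with f32 elements is costed as roughly 9 instructions,
// after treating one vsetvli as free.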
709static const CostTblEntry VectorIntrinsicCostTable[]{
710 {Intrinsic::floor, MVT::f32, 9},
711 {Intrinsic::floor, MVT::f64, 9},
712 {Intrinsic::ceil, MVT::f32, 9},
713 {Intrinsic::ceil, MVT::f64, 9},
714 {Intrinsic::trunc, MVT::f32, 7},
715 {Intrinsic::trunc, MVT::f64, 7},
716 {Intrinsic::round, MVT::f32, 9},
717 {Intrinsic::round, MVT::f64, 9},
718 {Intrinsic::roundeven, MVT::f32, 9},
719 {Intrinsic::roundeven, MVT::f64, 9},
720 {Intrinsic::rint, MVT::f32, 7},
721 {Intrinsic::rint, MVT::f64, 7},
722 {Intrinsic::lrint, MVT::i32, 1},
723 {Intrinsic::lrint, MVT::i64, 1},
724 {Intrinsic::llrint, MVT::i64, 1},
725 {Intrinsic::nearbyint, MVT::f32, 9},
726 {Intrinsic::nearbyint, MVT::f64, 9},
727 {Intrinsic::bswap, MVT::i16, 3},
728 {Intrinsic::bswap, MVT::i32, 12},
729 {Intrinsic::bswap, MVT::i64, 31},
730 {Intrinsic::vp_bswap, MVT::i16, 3},
731 {Intrinsic::vp_bswap, MVT::i32, 12},
732 {Intrinsic::vp_bswap, MVT::i64, 31},
733 {Intrinsic::vp_fshl, MVT::i8, 7},
734 {Intrinsic::vp_fshl, MVT::i16, 7},
735 {Intrinsic::vp_fshl, MVT::i32, 7},
736 {Intrinsic::vp_fshl, MVT::i64, 7},
737 {Intrinsic::vp_fshr, MVT::i8, 7},
738 {Intrinsic::vp_fshr, MVT::i16, 7},
739 {Intrinsic::vp_fshr, MVT::i32, 7},
740 {Intrinsic::vp_fshr, MVT::i64, 7},
741 {Intrinsic::bitreverse, MVT::i8, 17},
742 {Intrinsic::bitreverse, MVT::i16, 24},
743 {Intrinsic::bitreverse, MVT::i32, 33},
744 {Intrinsic::bitreverse, MVT::i64, 52},
745 {Intrinsic::vp_bitreverse, MVT::i8, 17},
746 {Intrinsic::vp_bitreverse, MVT::i16, 24},
747 {Intrinsic::vp_bitreverse, MVT::i32, 33},
748 {Intrinsic::vp_bitreverse, MVT::i64, 52},
749 {Intrinsic::ctpop, MVT::i8, 12},
750 {Intrinsic::ctpop, MVT::i16, 19},
751 {Intrinsic::ctpop, MVT::i32, 20},
752 {Intrinsic::ctpop, MVT::i64, 21},
753 {Intrinsic::vp_ctpop, MVT::i8, 12},
754 {Intrinsic::vp_ctpop, MVT::i16, 19},
755 {Intrinsic::vp_ctpop, MVT::i32, 20},
756 {Intrinsic::vp_ctpop, MVT::i64, 21},
757 {Intrinsic::vp_ctlz, MVT::i8, 19},
758 {Intrinsic::vp_ctlz, MVT::i16, 28},
759 {Intrinsic::vp_ctlz, MVT::i32, 31},
760 {Intrinsic::vp_ctlz, MVT::i64, 35},
761 {Intrinsic::vp_cttz, MVT::i8, 16},
762 {Intrinsic::vp_cttz, MVT::i16, 23},
763 {Intrinsic::vp_cttz, MVT::i32, 24},
764 {Intrinsic::vp_cttz, MVT::i64, 25},
765};
766
767static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
768 switch (ID) {
769#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
770 case Intrinsic::VPID: \
771 return ISD::VPSD;
772#include "llvm/IR/VPIntrinsics.def"
773#undef HELPER_MAP_VPID_TO_VPSD
774 }
775 return ISD::DELETED_NODE;
776}
777
781 auto *RetTy = ICA.getReturnType();
782 switch (ICA.getID()) {
783 case Intrinsic::ceil:
784 case Intrinsic::floor:
785 case Intrinsic::trunc:
786 case Intrinsic::rint:
787 case Intrinsic::lrint:
788 case Intrinsic::llrint:
789 case Intrinsic::round:
790 case Intrinsic::roundeven: {
791 // These all use the same code.
793 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
794 return LT.first * 8;
795 break;
796 }
797 case Intrinsic::umin:
798 case Intrinsic::umax:
799 case Intrinsic::smin:
800 case Intrinsic::smax: {
802 if ((ST->hasVInstructions() && LT.second.isVector()) ||
803 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
804 return LT.first;
805 break;
806 }
807 case Intrinsic::sadd_sat:
808 case Intrinsic::ssub_sat:
809 case Intrinsic::uadd_sat:
810 case Intrinsic::usub_sat:
811 case Intrinsic::fabs:
812 case Intrinsic::sqrt: {
814 if (ST->hasVInstructions() && LT.second.isVector())
815 return LT.first;
816 break;
817 }
818 case Intrinsic::ctpop: {
820 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
821 return LT.first;
822 break;
823 }
824 case Intrinsic::abs: {
826 if (ST->hasVInstructions() && LT.second.isVector()) {
827 // vrsub.vi v10, v8, 0
828 // vmax.vv v8, v8, v10
829 return LT.first * 2;
830 }
831 break;
832 }
833 // TODO: add more intrinsics
834 case Intrinsic::experimental_stepvector: {
835 unsigned Cost = 1; // vid
837 return Cost + (LT.first - 1);
838 }
839 case Intrinsic::vp_rint: {
840 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
841 unsigned Cost = 5;
843 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
844 return Cost * LT.first;
845 break;
846 }
847 case Intrinsic::vp_nearbyint: {
848 // One more read and one write of fflags than vp_rint.
849 unsigned Cost = 7;
851 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
852 return Cost * LT.first;
853 break;
854 }
855 case Intrinsic::vp_ceil:
856 case Intrinsic::vp_floor:
857 case Intrinsic::vp_round:
858 case Intrinsic::vp_roundeven:
859 case Intrinsic::vp_roundtozero: {
860 // Rounding with static rounding mode needs two more instructions to
861 // swap/write FRM than vp_rint.
862 unsigned Cost = 7;
864 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
865 if (TLI->isOperationCustom(VPISD, LT.second))
866 return Cost * LT.first;
867 break;
868 }
869 }
870
871 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
872 if (auto LT = getTypeLegalizationCost(RetTy);
873 LT.second.isVector()) {
874 MVT EltTy = LT.second.getVectorElementType();
875 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
876 ICA.getID(), EltTy))
877 return LT.first * Entry->Cost;
878 }
879 }
880
882}
883
885 Type *Src,
888 const Instruction *I) {
889 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
890 // FIXME: Need to compute legalizing cost for illegal types.
891 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
892 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
893
894 // Skip if element size of Dst or Src is bigger than ELEN.
895 if (Src->getScalarSizeInBits() > ST->getELen() ||
896 Dst->getScalarSizeInBits() > ST->getELen())
897 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
898
899 int ISD = TLI->InstructionOpcodeToISD(Opcode);
900 assert(ISD && "Invalid opcode");
901
902 // FIXME: Need to consider vsetvli and lmul.
903 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
904 (int)Log2_32(Src->getScalarSizeInBits());
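    // Illustrative example: truncating a vector from i64 to i16 elements gives
    // |PowDiff| == 2 and is modeled as two narrowing (vnsrl) steps; likewise
    // an fpext from f16 to f64 elements counts as two widening steps.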
905 switch (ISD) {
906 case ISD::SIGN_EXTEND:
907 case ISD::ZERO_EXTEND:
908 if (Src->getScalarSizeInBits() == 1) {
909 // We do not use vsext/vzext to extend from mask vector.
910 // Instead we use the following instructions to extend from mask vector:
911 // vmv.v.i v8, 0
912 // vmerge.vim v8, v8, -1, v0
913 return 2;
914 }
915 return 1;
916 case ISD::TRUNCATE:
917 if (Dst->getScalarSizeInBits() == 1) {
918 // We do not use several vncvt instructions to truncate to a mask vector,
919 // so we cannot use PowDiff to calculate it.
920 // Instead we use the following instructions to truncate to mask vector:
921 // vand.vi v8, v8, 1
922 // vmsne.vi v0, v8, 0
923 return 2;
924 }
925 [[fallthrough]];
926 case ISD::FP_EXTEND:
927 case ISD::FP_ROUND:
928 // Counts of narrow/widen instructions.
929 return std::abs(PowDiff);
930 case ISD::FP_TO_SINT:
931 case ISD::FP_TO_UINT:
932 case ISD::SINT_TO_FP:
933 case ISD::UINT_TO_FP:
934 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
935 // The cost of converting from or to a mask vector is different from the
936 // other cases, so we cannot use PowDiff to calculate it.
937 // For mask vector to fp, we should use the following instructions:
938 // vmv.v.i v8, 0
939 // vmerge.vim v8, v8, -1, v0
940 // vfcvt.f.x.v v8, v8
941
942 // And for fp vector to mask, we use:
943 // vfncvt.rtz.x.f.w v9, v8
944 // vand.vi v8, v9, 1
945 // vmsne.vi v0, v8, 0
946 return 3;
947 }
948 if (std::abs(PowDiff) <= 1)
949 return 1;
950 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
951 // so it only needs two conversions.
952 if (Src->isIntOrIntVectorTy())
953 return 2;
954 // Counts of narrow/widen instructions.
955 return std::abs(PowDiff);
956 }
957 }
958 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
959}
960
961unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
962 if (isa<ScalableVectorType>(Ty)) {
963 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
964 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
965 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
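    // Worked example (illustrative): for <vscale x 4 x i32> with a tuning
    // vscale of 2, VectorBits = 128 and MinSize = 128, giving an estimated
    // VLMAX of 8 lanes.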
966 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
967 }
968 return cast<FixedVectorType>(Ty)->getNumElements();
969}
970
973 FastMathFlags FMF,
975 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
976 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
977
978 // Skip if scalar size of Ty is bigger than ELEN.
979 if (Ty->getScalarSizeInBits() > ST->getELen())
980 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
981
982 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
983 if (Ty->getElementType()->isIntegerTy(1)) {
984 // SelectionDAGBuilder does following transforms:
985 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
986 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
987 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
988 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
989 else
990 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
991 }
992
993 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
994 InstructionCost BaseCost = 2;
995
997 return (LT.first - 1) + BaseCost;
998
999 unsigned VL = getEstimatedVLFor(Ty);
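  // Worked example (illustrative): an smax reduction of <8 x i32> is costed
  // as BaseCost (vmv.s.x + vmv.x.s) plus Log2_32_Ceil(8) == 3 for the
  // vredmax.vs reduction tree, i.e. 5 in total for a single legal register.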
1000 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1001}
1002
1005 std::optional<FastMathFlags> FMF,
1007 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1008 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1009
1010 // Skip if scalar size of Ty is bigger than ELEN.
1011 if (Ty->getScalarSizeInBits() > ST->getELen())
1012 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1013
1014 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1015 assert(ISD && "Invalid opcode");
1016
1017 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1018 ISD != ISD::FADD)
1019 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1020
1021 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1023 Type *ElementTy = Ty->getElementType();
1024 if (ElementTy->isIntegerTy(1)) {
1025 if (ISD == ISD::AND) {
1026 // Example sequences:
1027 // vsetvli a0, zero, e8, mf8, ta, ma
1028 // vmnot.m v8, v0
1029 // vcpop.m a0, v8
1030 // seqz a0, a0
1031 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1032 return (LT.first - 1) +
1033 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1034 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1036 } else {
1037 // Example sequences:
1038 // vsetvli a0, zero, e8, mf8, ta, ma
1039 // vcpop.m a0, v0
1040 // snez a0, a0
1041 Opcodes = {RISCV::VCPOP_M};
1042 return (LT.first - 1) +
1043 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1044 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1046 }
1047 }
1048
1049 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1051 Opcodes.push_back(RISCV::VFMV_S_F);
1052 for (unsigned i = 0; i < LT.first.getValue(); i++)
1053 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1054 Opcodes.push_back(RISCV::VFMV_F_S);
1055 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1056 }
1057 unsigned SplitOp;
1058 switch (ISD) {
1059 case ISD::ADD:
1060 SplitOp = RISCV::VADD_VV;
1061 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1062 break;
1063 case ISD::OR:
1064 SplitOp = RISCV::VOR_VV;
1065 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1066 break;
1067 case ISD::XOR:
1068 SplitOp = RISCV::VXOR_VV;
1069 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1070 break;
1071 case ISD::AND:
1072 SplitOp = RISCV::VAND_VV;
1073 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1074 break;
1075 case ISD::FADD:
1076 SplitOp = RISCV::VFADD_VV;
1077 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1078 break;
1079 }
1080 // Add a cost for data larger than LMUL8
1081 InstructionCost SplitCost =
1082 (LT.first > 1) ? (LT.first - 1) *
1083 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1084 : 0;
1085 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1086}
1087
1089 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1091 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1092 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1093 FMF, CostKind);
1094
1095 // Skip if scalar size of ResTy is bigger than ELEN.
1096 if (ResTy->getScalarSizeInBits() > ST->getELen())
1097 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1098 FMF, CostKind);
1099
1100 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1101 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1102 FMF, CostKind);
1103
1104 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1105
1106 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1107 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1108 FMF, CostKind);
1109
1110 return (LT.first - 1) +
1111 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1112}
1113
1115 TTI::OperandValueInfo OpInfo,
1117 assert(OpInfo.isConstant() && "non constant operand?");
1118 if (!isa<VectorType>(Ty))
1119 // FIXME: We need to account for immediate materialization here, but doing
1120 // a decent job requires more knowledge about the immediate than we
1121 // currently have here.
1122 return 0;
1123
1124 if (OpInfo.isUniform())
1125 // vmv.x.i, vmv.v.x, or vfmv.v.f
1126 // We ignore the cost of the scalar constant materialization to be consistent
1127 // with how we treat scalar constants themselves just above.
1128 return 1;
1129
1130 return getConstantPoolLoadCost(Ty, CostKind);
1131}
1132
1133
1135 MaybeAlign Alignment,
1136 unsigned AddressSpace,
1138 TTI::OperandValueInfo OpInfo,
1139 const Instruction *I) {
1140 EVT VT = TLI->getValueType(DL, Src, true);
1141 // Type legalization can't handle structs
1142 if (VT == MVT::Other)
1143 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1144 CostKind, OpInfo, I);
1145
1147 if (Opcode == Instruction::Store && OpInfo.isConstant())
1148 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1149 InstructionCost BaseCost =
1150 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1151 CostKind, OpInfo, I);
1152 // Assume memory op costs scale with the number of vector registers
1153 // possibly accessed by the instruction. Note that BasicTTI already
1154 // handles the LT.first term for us.
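  // Worked example (illustrative): with VLEN=128, a load of <16 x i32>
  // legalizes to an LMUL=4 type, so the base load cost is scaled by roughly 4
  // compared to a single-register (m1) access.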
1155 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1156 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1157 BaseCost *= TLI->getLMULCost(LT.second);
1158 return Cost + BaseCost;
1159
1160}
1161
1163 Type *CondTy,
1164 CmpInst::Predicate VecPred,
1166 const Instruction *I) {
1168 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1169 I);
1170
1171 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1172 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1173 I);
1174
1175 // Skip if scalar size of ValTy is bigger than ELEN.
1176 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1177 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1178 I);
1179
1180 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1181 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1182 if (CondTy->isVectorTy()) {
1183 if (ValTy->getScalarSizeInBits() == 1) {
1184 // vmandn.mm v8, v8, v9
1185 // vmand.mm v9, v0, v9
1186 // vmor.mm v0, v9, v8
1187 return LT.first * 3;
1188 }
1189 // vselect and max/min are supported natively.
1190 return LT.first * 1;
1191 }
1192
1193 if (ValTy->getScalarSizeInBits() == 1) {
1194 // vmv.v.x v9, a0
1195 // vmsne.vi v9, v9, 0
1196 // vmandn.mm v8, v8, v9
1197 // vmand.mm v9, v0, v9
1198 // vmor.mm v0, v9, v8
1199 return LT.first * 5;
1200 }
1201
1202 // vmv.v.x v10, a0
1203 // vmsne.vi v0, v10, 0
1204 // vmerge.vvm v8, v9, v8, v0
1205 return LT.first * 3;
1206 }
1207
1208 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1209 ValTy->isVectorTy()) {
1210 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1211
1212 // Support natively.
1213 if (CmpInst::isIntPredicate(VecPred))
1214 return LT.first * 1;
1215
1216 // If we do not support the input floating point vector type, use the base
1217 // one which will calculate as:
1218 // ScalarizeCost + Num * Cost for fixed vector,
1219 // InvalidCost for scalable vector.
1220 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1221 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1222 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1223 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1224 I);
1225 switch (VecPred) {
1226 // Support natively.
1227 case CmpInst::FCMP_OEQ:
1228 case CmpInst::FCMP_OGT:
1229 case CmpInst::FCMP_OGE:
1230 case CmpInst::FCMP_OLT:
1231 case CmpInst::FCMP_OLE:
1232 case CmpInst::FCMP_UNE:
1233 return LT.first * 1;
1234 // TODO: Other comparisons?
1235 default:
1236 break;
1237 }
1238 }
1239
1240 // TODO: Add cost for scalar type.
1241
1242 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1243}
1244
1247 const Instruction *I) {
1249 return Opcode == Instruction::PHI ? 0 : 1;
1250 // Branches are assumed to be predicted.
1251 return 0;
1252}
1253
1256 unsigned Index, Value *Op0,
1257 Value *Op1) {
1258 assert(Val->isVectorTy() && "This must be a vector type");
1259
1260 if (Opcode != Instruction::ExtractElement &&
1261 Opcode != Instruction::InsertElement)
1262 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1263
1264 // Legalize the type.
1265 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1266
1267 // This type is legalized to a scalar type.
1268 if (!LT.second.isVector()) {
1269 auto *FixedVecTy = cast<FixedVectorType>(Val);
1270 // If Index is a known constant, cost is zero.
1271 if (Index != -1U)
1272 return 0;
1273 // Extract/InsertElement with non-constant index is very costly when
1274 // scalarized; estimate cost of loads/stores sequence via the stack:
1275 // ExtractElement cost: store vector to stack, load scalar;
1276 // InsertElement cost: store vector to stack, store scalar, load vector.
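    // Worked example (illustrative): for a scalarized <4 x i32>, a
    // variable-index extract is costed as 4 scalar stores + 1 scalar load,
    // and a variable-index insert as 5 scalar stores + 4 scalar loads.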
1277 Type *ElemTy = FixedVecTy->getElementType();
1278 auto NumElems = FixedVecTy->getNumElements();
1279 auto Align = DL.getPrefTypeAlign(ElemTy);
1280 InstructionCost LoadCost =
1281 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1282 InstructionCost StoreCost =
1283 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1284 return Opcode == Instruction::ExtractElement
1285 ? StoreCost * NumElems + LoadCost
1286 : (StoreCost + LoadCost) * NumElems + StoreCost;
1287 }
1288
1289 // For unsupported scalable vector.
1290 if (LT.second.isScalableVector() && !LT.first.isValid())
1291 return LT.first;
1292
1293 if (!isTypeLegal(Val))
1294 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1295
1296 // Mask vector extract/insert is expanded via e8.
1297 if (Val->getScalarSizeInBits() == 1) {
1298 VectorType *WideTy =
1300 cast<VectorType>(Val)->getElementCount());
1301 if (Opcode == Instruction::ExtractElement) {
1302 InstructionCost ExtendCost
1303 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1305 InstructionCost ExtractCost
1306 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1307 return ExtendCost + ExtractCost;
1308 }
1309 InstructionCost ExtendCost
1310 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1312 InstructionCost InsertCost
1313 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1314 InstructionCost TruncCost
1315 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1317 return ExtendCost + InsertCost + TruncCost;
1318 }
1319
1320
1321 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1322 // and vslideup + vmv.s.x to insert element to vector.
1323 unsigned BaseCost = 1;
1324 // For insertelement, we need an extra addi to add 1 to the index used in the vslideup sequence.
1325 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1326
1327 if (Index != -1U) {
1328 // The type may be split. For fixed-width vectors we can normalize the
1329 // index to the new type.
1330 if (LT.second.isFixedLengthVector()) {
1331 unsigned Width = LT.second.getVectorNumElements();
1332 Index = Index % Width;
1333 }
1334
1335 // We could extract/insert the first element without vslidedown/vslideup.
1336 if (Index == 0)
1337 SlideCost = 0;
1338 else if (Opcode == Instruction::InsertElement)
1339 SlideCost = 1; // With a constant index, we do not need to use addi.
1340 }
1341
1342 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
1343 if (Val->getScalarType()->isIntegerTy() &&
1344 ST->getXLen() < Val->getScalarSizeInBits()) {
1345 // For extractelement, we need the following instructions:
1346 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1347 // vslidedown.vx v8, v8, a0
1348 // vmv.x.s a0, v8
1349 // li a1, 32
1350 // vsrl.vx v8, v8, a1
1351 // vmv.x.s a1, v8
1352
1353 // For insertelement, we need the following instructions:
1354 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1355 // vmv.v.i v12, 0
1356 // vslide1up.vx v16, v12, a1
1357 // vslide1up.vx v12, v16, a0
1358 // addi a0, a2, 1
1359 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1360 // vslideup.vx v8, v12, a2
1361
1362 // TODO: should we count these special vsetvlis?
1363 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1364 }
1365 return BaseCost + SlideCost;
1366}
1367
1369 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1371 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1372
1373 // TODO: Handle more cost kinds.
1375 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1376 Args, CxtI);
1377
1378 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1379 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1380 Args, CxtI);
1381
1382 // Skip if scalar size of Ty is bigger than ELEN.
1383 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1384 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1385 Args, CxtI);
1386
1387 // Legalize the type.
1388 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1389
1390 // TODO: Handle scalar type.
1391 if (!LT.second.isVector())
1392 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1393 Args, CxtI);
1394
1395
1396 auto getConstantMatCost =
1397 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1398 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1399 // Two sub-cases:
1400 // * Has a 5 bit immediate operand which can be splatted.
1401 // * Has a larger immediate which must be materialized in scalar register
1402 // We return 0 for both as we currently ignore the cost of materializing
1403 // scalar constants in GPRs.
1404 return 0;
1405
1406 return getConstantPoolLoadCost(Ty, CostKind);
1407 };
1408
1409 // Add the cost of materializing any constant vectors required.
1410 InstructionCost ConstantMatCost = 0;
1411 if (Op1Info.isConstant())
1412 ConstantMatCost += getConstantMatCost(0, Op1Info);
1413 if (Op2Info.isConstant())
1414 ConstantMatCost += getConstantMatCost(1, Op2Info);
1415
1416 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1417 case ISD::ADD:
1418 case ISD::SUB:
1419 case ISD::AND:
1420 case ISD::OR:
1421 case ISD::XOR:
1422 case ISD::SHL:
1423 case ISD::SRL:
1424 case ISD::SRA:
1425 case ISD::MUL:
1426 case ISD::MULHS:
1427 case ISD::MULHU:
1428 case ISD::FADD:
1429 case ISD::FSUB:
1430 case ISD::FMUL:
1431 case ISD::FNEG: {
1432 return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
1433 }
1434 default:
1435 return ConstantMatCost +
1436 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1437 Args, CxtI);
1438 }
1439}
1440
1441// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1443 ArrayRef<const Value *> Ptrs, const Value *Base,
1444 const TTI::PointersChainInfo &Info, Type *AccessTy,
1447 // In the basic model we take into account GEP instructions only
1448 // (although an alloca instruction, a plain value, constants and/or
1449 // constant expressions, PHIs, and bitcasts, i.e. whatever is allowed to be
1450 // used as a pointer, can appear here). Typically, if Base is not a GEP
1451 // instruction and all the pointers are relative to the same base address,
1452 // all the rest are either GEP instructions, PHIs, bitcasts or constants.
1453 // When they share the same base, we calculate the cost of each non-Base
1454 // GEP as an ADD operation if any of its indices is non-constant.
1455 // If there are no known dependencies between the pointers, the cost is
1456 // calculated as a sum of the costs of the GEP instructions.
1457 for (auto [I, V] : enumerate(Ptrs)) {
1458 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1459 if (!GEP)
1460 continue;
1461 if (Info.isSameBase() && V != Base) {
1462 if (GEP->hasAllConstantIndices())
1463 continue;
1464 // If the chain is unit-stride and BaseReg + stride*i is a legal
1465 // addressing mode, then presume the base GEP is sitting around in a
1466 // register somewhere and check if we can fold the offset relative to
1467 // it.
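      // Illustrative example: for i32 accesses in a unit-stride chain, the
      // I-th pointer is BaseReg + 4*I, which fits the reg+imm12 addressing
      // mode for small I, so no extra ADD is charged for it.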
1468 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1469 if (Info.isUnitStride() &&
1470 isLegalAddressingMode(AccessTy,
1471 /* BaseGV */ nullptr,
1472 /* BaseOffset */ Stride * I,
1473 /* HasBaseReg */ true,
1474 /* Scale */ 0,
1475 GEP->getType()->getPointerAddressSpace()))
1476 continue;
1477 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1478 {TTI::OK_AnyValue, TTI::OP_None},
1479 {TTI::OK_AnyValue, TTI::OP_None},
1480 std::nullopt);
1481 } else {
1482 SmallVector<const Value *> Indices(GEP->indices());
1483 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1484 Indices, AccessTy, CostKind);
1485 }
1486 }
1487 return Cost;
1488}
1489
1493 // TODO: More tuning on benchmarks and metrics with changes as needed
1494 // would apply to all settings below to enable performance.
1495
1496
1497 if (ST->enableDefaultUnroll())
1498 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1499
1500 // Enable upper-bound unrolling universally, not dependent upon the conditions
1501 // below.
1502 UP.UpperBound = true;
1503
1504 // Disable loop unrolling for Oz and Os.
1505 UP.OptSizeThreshold = 0;
1507 if (L->getHeader()->getParent()->hasOptSize())
1508 return;
1509
1510 SmallVector<BasicBlock *, 4> ExitingBlocks;
1511 L->getExitingBlocks(ExitingBlocks);
1512 LLVM_DEBUG(dbgs() << "Loop has:\n"
1513 << "Blocks: " << L->getNumBlocks() << "\n"
1514 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1515
1516 // Only allow another exit other than the latch. This acts as an early exit
1517 // as it mirrors the profitability calculation of the runtime unroller.
1518 if (ExitingBlocks.size() > 2)
1519 return;
1520
1521 // Limit the CFG of the loop body for targets with a branch predictor.
1522 // Allowing 4 blocks permits if-then-else diamonds in the body.
1523 if (L->getNumBlocks() > 4)
1524 return;
1525
1526 // Don't unroll vectorized loops, including the remainder loop
1527 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1528 return;
1529
1530 // Scan the loop: don't unroll loops with calls as this could prevent
1531 // inlining.
1533 for (auto *BB : L->getBlocks()) {
1534 for (auto &I : *BB) {
1535 // Initial setting - Don't unroll loops containing vectorized
1536 // instructions.
1537 if (I.getType()->isVectorTy())
1538 return;
1539
1540 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1541 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1542 if (!isLoweredToCall(F))
1543 continue;
1544 }
1545 return;
1546 }
1547
1548 SmallVector<const Value *> Operands(I.operand_values());
1551 }
1552 }
1553
1554 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1555
1556 UP.Partial = true;
1557 UP.Runtime = true;
1558 UP.UnrollRemainder = true;
1559 UP.UnrollAndJam = true;
1561
1562 // Forcing unrolling of small loops can be very useful because of the
1563 // branch-taken cost of the backedge.
1564 if (Cost < 12)
1565 UP.Force = true;
1566}
1567
1571}
1572
1575 if (Ty->isVectorTy()) {
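    // Worked example (illustrative): <vscale x 4 x i32> has a known minimum
    // size of 128 bits, so it occupies divideCeil(128, 64) == 2 vector
    // register blocks.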
1576 if (Size.isScalable() && ST->hasVInstructions())
1577 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1578
1580 return divideCeil(Size, ST->getRealMinVLen());
1581 }
1582
1583 return BaseT::getRegUsageForType(Ty);
1584}
1585
1586unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1587 if (SLPMaxVF.getNumOccurrences())
1588 return SLPMaxVF;
1589
1590 // Return how many elements can fit in getRegisterBitWidth. This is the
1591 // same routine as used in LoopVectorizer. We should probably be
1592 // accounting for whether we actually have instructions with the right
1593 // lane type, but we don't have enough information to do that without
1594 // some additional plumbing which hasn't been justified yet.
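  // Worked example (illustrative, assuming fixed-length RVV codegen is
  // enabled): with VLEN=128 and the default riscv-v-register-bit-width-lmul
  // of 2, the fixed-width register width is 256 bits, so ElemWidth=32 yields
  // a maximum SLP VF of 8.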
1595 TypeSize RegWidth =
1597 // If no vector registers, or absurd element widths, disable
1598 // vectorization by returning 1.
1599 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1600}
1601
1603 const TargetTransformInfo::LSRCost &C2) {
1604 // The RISC-V-specific choice here is to give instruction count first priority.
1605 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1606 C1.NumIVMuls, C1.NumBaseAdds,
1607 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1608 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1609 C2.NumIVMuls, C2.NumBaseAdds,
1610 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1611}
Definition: InstrTypes.h:970
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:972
@ ICMP_EQ
equal
Definition: InstrTypes.h:986
@ ICMP_NE
not equal
Definition: InstrTypes.h:987
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:981
bool isIntPredicate() const
Definition: InstrTypes.h:1084
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:472
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
static InstructionCost getInvalid(CostType Val=0)
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:41
The optimization diagnostic interface.
bool hasVInstructionsF64() const
unsigned getRealMinVLen() const
bool useRVVForFixedLengthVectors() const
unsigned getXLen() const
bool hasVInstructionsF16() const
bool hasVInstructions() const
unsigned getRealMaxVLen() const
bool hasVInstructionsF32() const
unsigned getELen() const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
std::optional< unsigned > getVScaleForTuning() const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
std::optional< unsigned > getMaxVScale() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool shouldExpandReduction(const IntrinsicInst *II) const
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind)
Return the cost of materializing an immediate for a value operand of a store instruction.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getRegUsageForType(Type *Ty)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace, const DataLayout &) const
Returns whether or not generating a interleaved load/store intrinsic for this type will be legal.
The main scalar evolution driver.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_t size() const
Definition: SmallVector.h:91
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
const DataLayout & getDataLayout() const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:333
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:928
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:326
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1085
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are are tuples (A,...
Definition: STLExtras.h:2386
AddressSpace
Definition: NVPTXBaseInfo.h:21
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:258
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1937
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr int PoisonMaskElem
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
InstructionCost Cost
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2005
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:34
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).