File: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning: line 1699, column 21: The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
1 | //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | ||||
9 | #include "AArch64TargetTransformInfo.h" | |||
10 | #include "AArch64ExpandImm.h" | |||
11 | #include "MCTargetDesc/AArch64AddressingModes.h" | |||
12 | #include "llvm/Analysis/LoopInfo.h" | |||
13 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
14 | #include "llvm/CodeGen/BasicTTIImpl.h" | |||
15 | #include "llvm/CodeGen/CostTable.h" | |||
16 | #include "llvm/CodeGen/TargetLowering.h" | |||
17 | #include "llvm/IR/Intrinsics.h" | |||
18 | #include "llvm/IR/IntrinsicInst.h" | |||
19 | #include "llvm/IR/IntrinsicsAArch64.h" | |||
20 | #include "llvm/IR/PatternMatch.h" | |||
21 | #include "llvm/Support/Debug.h" | |||
22 | #include "llvm/Transforms/InstCombine/InstCombiner.h" | |||
23 | #include <algorithm> | |||
24 | using namespace llvm; | |||
25 | using namespace llvm::PatternMatch; | |||
26 | ||||
27 | #define DEBUG_TYPE "aarch64tti"
28 | ||||
29 | static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", | |||
30 | cl::init(true), cl::Hidden); | |||
31 | ||||
32 | bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, | |||
33 | const Function *Callee) const { | |||
34 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
35 | ||||
36 | const FeatureBitset &CallerBits = | |||
37 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | |||
38 | const FeatureBitset &CalleeBits = | |||
39 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | |||
40 | ||||
41 | // Inline a callee if its target-features are a subset of the caller's
42 | // target-features.
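// e.g., Caller = {neon, sve} and Callee = {neon} satisfies
// (CallerBits & CalleeBits) == CalleeBits; a callee that additionally
// required sve2 would not. (Feature names here are illustrative.)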
43 | return (CallerBits & CalleeBits) == CalleeBits; | |||
44 | } | |||
45 | ||||
46 | /// Calculate the cost of materializing a 64-bit value. This helper | |||
47 | /// method might only calculate a fraction of a larger immediate. Therefore it | |||
48 | /// is valid to return a cost of ZERO. | |||
49 | InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { | |||
50 | // Check if the immediate can be encoded within an instruction. | |||
51 | if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) | |||
52 | return 0; | |||
53 | ||||
54 | if (Val < 0) | |||
55 | Val = ~Val; | |||
56 | ||||
57 | // Calculate how many moves we will need to materialize this constant. | |||
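// e.g., a 64-bit value with four distinct non-zero 16-bit chunks typically
// expands to one MOVZ plus three MOVKs, for a cost of 4.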
58 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; | |||
59 | AArch64_IMM::expandMOVImm(Val, 64, Insn); | |||
60 | return Insn.size(); | |||
61 | } | |||
62 | ||||
63 | /// Calculate the cost of materializing the given constant. | |||
64 | InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | |||
65 | TTI::TargetCostKind CostKind) { | |||
66 | assert(Ty->isIntegerTy());
67 | ||||
68 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
69 | if (BitSize == 0) | |||
70 | return ~0U; | |||
71 | ||||
72 | // Sign-extend all constants to a multiple of 64-bit. | |||
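// e.g., BitSize == 65 is widened to (65 + 63) & ~0x3f == 128 bits.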
73 | APInt ImmVal = Imm; | |||
74 | if (BitSize & 0x3f) | |||
75 | ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); | |||
76 | ||||
77 | // Split the constant into 64-bit chunks and calculate the cost for each | |||
78 | // chunk. | |||
79 | InstructionCost Cost = 0; | |||
80 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | |||
81 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | |||
82 | int64_t Val = Tmp.getSExtValue(); | |||
83 | Cost += getIntImmCost(Val); | |||
84 | } | |||
85 | // We need at least one instruction to materialize the constant.
86 | return std::max<InstructionCost>(1, Cost); | |||
87 | } | |||
88 | ||||
89 | InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | |||
90 | const APInt &Imm, Type *Ty, | |||
91 | TTI::TargetCostKind CostKind, | |||
92 | Instruction *Inst) { | |||
93 | assert(Ty->isIntegerTy());
94 | ||||
95 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
96 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
97 | // here, so that constant hoisting will ignore this constant. | |||
98 | if (BitSize == 0) | |||
99 | return TTI::TCC_Free; | |||
100 | ||||
101 | unsigned ImmIdx = ~0U; | |||
102 | switch (Opcode) { | |||
103 | default: | |||
104 | return TTI::TCC_Free; | |||
105 | case Instruction::GetElementPtr: | |||
106 | // Always hoist the base address of a GetElementPtr. | |||
107 | if (Idx == 0) | |||
108 | return 2 * TTI::TCC_Basic; | |||
109 | return TTI::TCC_Free; | |||
110 | case Instruction::Store: | |||
111 | ImmIdx = 0; | |||
112 | break; | |||
113 | case Instruction::Add: | |||
114 | case Instruction::Sub: | |||
115 | case Instruction::Mul: | |||
116 | case Instruction::UDiv: | |||
117 | case Instruction::SDiv: | |||
118 | case Instruction::URem: | |||
119 | case Instruction::SRem: | |||
120 | case Instruction::And: | |||
121 | case Instruction::Or: | |||
122 | case Instruction::Xor: | |||
123 | case Instruction::ICmp: | |||
124 | ImmIdx = 1; | |||
125 | break; | |||
126 | // Always return TCC_Free for the shift value of a shift instruction. | |||
127 | case Instruction::Shl: | |||
128 | case Instruction::LShr: | |||
129 | case Instruction::AShr: | |||
130 | if (Idx == 1) | |||
131 | return TTI::TCC_Free; | |||
132 | break; | |||
133 | case Instruction::Trunc: | |||
134 | case Instruction::ZExt: | |||
135 | case Instruction::SExt: | |||
136 | case Instruction::IntToPtr: | |||
137 | case Instruction::PtrToInt: | |||
138 | case Instruction::BitCast: | |||
139 | case Instruction::PHI: | |||
140 | case Instruction::Call: | |||
141 | case Instruction::Select: | |||
142 | case Instruction::Ret: | |||
143 | case Instruction::Load: | |||
144 | break; | |||
145 | } | |||
146 | ||||
147 | if (Idx == ImmIdx) { | |||
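// If materializing the immediate costs no more than one basic op per
// 64-bit chunk, report it as free so constant hoisting leaves it in place.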
148 | int NumConstants = (BitSize + 63) / 64; | |||
149 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
150 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
151 | ? static_cast<int>(TTI::TCC_Free) | |||
152 | : Cost; | |||
153 | } | |||
154 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
155 | } | |||
156 | ||||
157 | InstructionCost | |||
158 | AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, | |||
159 | const APInt &Imm, Type *Ty, | |||
160 | TTI::TargetCostKind CostKind) { | |||
161 | assert(Ty->isIntegerTy());
162 | ||||
163 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
164 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
165 | // here, so that constant hoisting will ignore this constant. | |||
166 | if (BitSize == 0) | |||
167 | return TTI::TCC_Free; | |||
168 | ||||
169 | // Most (all?) AArch64 intrinsics do not support folding immediates into the | |||
170 | // selected instruction, so we compute the materialization cost for the | |||
171 | // immediate directly. | |||
172 | if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) | |||
173 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
174 | ||||
175 | switch (IID) { | |||
176 | default: | |||
177 | return TTI::TCC_Free; | |||
178 | case Intrinsic::sadd_with_overflow: | |||
179 | case Intrinsic::uadd_with_overflow: | |||
180 | case Intrinsic::ssub_with_overflow: | |||
181 | case Intrinsic::usub_with_overflow: | |||
182 | case Intrinsic::smul_with_overflow: | |||
183 | case Intrinsic::umul_with_overflow: | |||
184 | if (Idx == 1) { | |||
185 | int NumConstants = (BitSize + 63) / 64; | |||
186 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
187 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
188 | ? static_cast<int>(TTI::TCC_Free) | |||
189 | : Cost; | |||
190 | } | |||
191 | break; | |||
192 | case Intrinsic::experimental_stackmap: | |||
193 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
194 | return TTI::TCC_Free; | |||
195 | break; | |||
196 | case Intrinsic::experimental_patchpoint_void: | |||
197 | case Intrinsic::experimental_patchpoint_i64: | |||
198 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
199 | return TTI::TCC_Free; | |||
200 | break; | |||
201 | case Intrinsic::experimental_gc_statepoint: | |||
202 | if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
203 | return TTI::TCC_Free; | |||
204 | break; | |||
205 | } | |||
206 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
207 | } | |||
208 | ||||
209 | TargetTransformInfo::PopcntSupportKind | |||
210 | AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { | |||
211 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
212 | if (TyWidth == 32 || TyWidth == 64) | |||
213 | return TTI::PSK_FastHardware; | |||
214 | // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. | |||
215 | return TTI::PSK_Software; | |||
216 | } | |||
217 | ||||
218 | InstructionCost | |||
219 | AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
220 | TTI::TargetCostKind CostKind) { | |||
221 | auto *RetTy = ICA.getReturnType(); | |||
222 | switch (ICA.getID()) { | |||
223 | case Intrinsic::umin: | |||
224 | case Intrinsic::umax: | |||
225 | case Intrinsic::smin: | |||
226 | case Intrinsic::smax: { | |||
227 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
228 | MVT::v8i16, MVT::v2i32, MVT::v4i32}; | |||
229 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
230 | // v2i64 types get converted to cmp+bif hence the cost of 2 | |||
231 | if (LT.second == MVT::v2i64) | |||
232 | return LT.first * 2; | |||
233 | if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
234 | return LT.first; | |||
235 | break; | |||
236 | } | |||
237 | case Intrinsic::sadd_sat: | |||
238 | case Intrinsic::ssub_sat: | |||
239 | case Intrinsic::uadd_sat: | |||
240 | case Intrinsic::usub_sat: { | |||
241 | static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
242 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
243 | MVT::v2i64}; | |||
244 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
245 | // This is a base cost of 1 for the vadd, plus 3 extract shifts if we | |||
246 | // need to extend the type, as it uses shr(qadd(shl, shl)). | |||
247 | unsigned Instrs = | |||
248 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; | |||
249 | if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
250 | return LT.first * Instrs; | |||
251 | break; | |||
252 | } | |||
253 | case Intrinsic::abs: { | |||
254 | static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
255 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
256 | MVT::v2i64}; | |||
257 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
258 | if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
259 | return LT.first; | |||
260 | break; | |||
261 | } | |||
262 | case Intrinsic::experimental_stepvector: { | |||
263 | InstructionCost Cost = 1; // Cost of the `index' instruction | |||
264 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
265 | // Legalisation of illegal vectors involves an `index' instruction plus | |||
266 | // (LT.first - 1) vector adds. | |||
267 | if (LT.first > 1) { | |||
268 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); | |||
269 | InstructionCost AddCost = | |||
270 | getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); | |||
271 | Cost += AddCost * (LT.first - 1); | |||
272 | } | |||
273 | return Cost; | |||
274 | } | |||
275 | case Intrinsic::bitreverse: { | |||
276 | static const CostTblEntry BitreverseTbl[] = { | |||
277 | {Intrinsic::bitreverse, MVT::i32, 1}, | |||
278 | {Intrinsic::bitreverse, MVT::i64, 1}, | |||
279 | {Intrinsic::bitreverse, MVT::v8i8, 1}, | |||
280 | {Intrinsic::bitreverse, MVT::v16i8, 1}, | |||
281 | {Intrinsic::bitreverse, MVT::v4i16, 2}, | |||
282 | {Intrinsic::bitreverse, MVT::v8i16, 2}, | |||
283 | {Intrinsic::bitreverse, MVT::v2i32, 2}, | |||
284 | {Intrinsic::bitreverse, MVT::v4i32, 2}, | |||
285 | {Intrinsic::bitreverse, MVT::v1i64, 2}, | |||
286 | {Intrinsic::bitreverse, MVT::v2i64, 2}, | |||
287 | }; | |||
288 | const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy); | |||
289 | const auto *Entry = | |||
290 | CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); | |||
291 | // The cost model uses the legal type (i32) that i8 and i16 are promoted
292 | // to, so add +1 to match the actual lowering cost.
293 | if (Entry && (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
294 | TLI->getValueType(DL, RetTy, true) == MVT::i16))
295 | return LegalisationCost.first * Entry->Cost + 1;
296 | if (Entry)
297 | return LegalisationCost.first * Entry->Cost;
298 | break; | |||
299 | } | |||
300 | case Intrinsic::ctpop: { | |||
301 | static const CostTblEntry CtpopCostTbl[] = { | |||
302 | {ISD::CTPOP, MVT::v2i64, 4}, | |||
303 | {ISD::CTPOP, MVT::v4i32, 3}, | |||
304 | {ISD::CTPOP, MVT::v8i16, 2}, | |||
305 | {ISD::CTPOP, MVT::v16i8, 1}, | |||
306 | {ISD::CTPOP, MVT::i64, 4}, | |||
307 | {ISD::CTPOP, MVT::v2i32, 3}, | |||
308 | {ISD::CTPOP, MVT::v4i16, 2}, | |||
309 | {ISD::CTPOP, MVT::v8i8, 1}, | |||
310 | {ISD::CTPOP, MVT::i32, 5}, | |||
311 | }; | |||
312 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
313 | MVT MTy = LT.second; | |||
314 | if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { | |||
315 | // Extra cost of +1 when illegal vector types are legalized by promoting | |||
316 | // the integer type. | |||
317 | int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != | |||
318 | RetTy->getScalarSizeInBits() | |||
319 | ? 1 | |||
320 | : 0; | |||
321 | return LT.first * Entry->Cost + ExtraCost; | |||
322 | } | |||
323 | break; | |||
324 | } | |||
325 | default: | |||
326 | break; | |||
327 | } | |||
328 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
329 | } | |||
330 | ||||
331 | /// The function will remove redundant reinterprets casting in the presence | |||
332 | /// of the control flow | |||
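/// e.g., a convert.from.svbool whose operand is a phi of
/// convert.to.svbool(%a) / convert.to.svbool(%b) is rewritten as a phi of
/// %a / %b in the original predicate type.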
333 | static Optional<Instruction *> processPhiNode(InstCombiner &IC, | |||
334 | IntrinsicInst &II) { | |||
335 | SmallVector<Instruction *, 32> Worklist; | |||
336 | auto RequiredType = II.getType(); | |||
337 | ||||
338 | auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); | |||
339 | assert(PN && "Expected Phi Node!")(static_cast <bool> (PN && "Expected Phi Node!" ) ? void (0) : __assert_fail ("PN && \"Expected Phi Node!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp" , 339, __extension__ __PRETTY_FUNCTION__)); | |||
340 | ||||
341 | // Don't create a new Phi unless we can remove the old one. | |||
342 | if (!PN->hasOneUse()) | |||
343 | return None; | |||
344 | ||||
345 | for (Value *IncValPhi : PN->incoming_values()) { | |||
346 | auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); | |||
347 | if (!Reinterpret || | |||
348 | Reinterpret->getIntrinsicID() != | |||
349 | Intrinsic::aarch64_sve_convert_to_svbool || | |||
350 | RequiredType != Reinterpret->getArgOperand(0)->getType()) | |||
351 | return None; | |||
352 | } | |||
353 | ||||
354 | // Create the new Phi | |||
355 | LLVMContext &Ctx = PN->getContext(); | |||
356 | IRBuilder<> Builder(Ctx); | |||
357 | Builder.SetInsertPoint(PN); | |||
358 | PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); | |||
359 | Worklist.push_back(PN); | |||
360 | ||||
361 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { | |||
362 | auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); | |||
363 | NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); | |||
364 | Worklist.push_back(Reinterpret); | |||
365 | } | |||
366 | ||||
367 | // Cleanup Phi Node and reinterprets | |||
368 | return IC.replaceInstUsesWith(II, NPN); | |||
369 | } | |||
370 | ||||
371 | static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, | |||
372 | IntrinsicInst &II) { | |||
373 | // If the reinterpret instruction operand is a PHI Node | |||
374 | if (isa<PHINode>(II.getArgOperand(0))) | |||
375 | return processPhiNode(IC, II); | |||
376 | ||||
377 | SmallVector<Instruction *, 32> CandidatesForRemoval; | |||
378 | Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; | |||
379 | ||||
380 | const auto *IVTy = cast<VectorType>(II.getType()); | |||
381 | ||||
382 | // Walk the chain of conversions. | |||
383 | while (Cursor) { | |||
384 | // If the type of the cursor has fewer lanes than the final result, zeroing | |||
385 | // must take place, which breaks the equivalence chain. | |||
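// e.g., if II produces nxv8i1 but the chain passes through nxv4i1, lanes
// 4-7 were zeroed on the widening back to svbool, so earlier values are
// not equivalent.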
386 | const auto *CursorVTy = cast<VectorType>(Cursor->getType()); | |||
387 | if (CursorVTy->getElementCount().getKnownMinValue() < | |||
388 | IVTy->getElementCount().getKnownMinValue()) | |||
389 | break; | |||
390 | ||||
391 | // If the cursor has the same type as I, it is a viable replacement. | |||
392 | if (Cursor->getType() == IVTy) | |||
393 | EarliestReplacement = Cursor; | |||
394 | ||||
395 | auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); | |||
396 | ||||
397 | // If this is not an SVE conversion intrinsic, this is the end of the chain. | |||
398 | if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == | |||
399 | Intrinsic::aarch64_sve_convert_to_svbool || | |||
400 | IntrinsicCursor->getIntrinsicID() == | |||
401 | Intrinsic::aarch64_sve_convert_from_svbool)) | |||
402 | break; | |||
403 | ||||
404 | CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); | |||
405 | Cursor = IntrinsicCursor->getOperand(0); | |||
406 | } | |||
407 | ||||
408 | // If no viable replacement in the conversion chain was found, there is | |||
409 | // nothing to do. | |||
410 | if (!EarliestReplacement) | |||
411 | return None; | |||
412 | ||||
413 | return IC.replaceInstUsesWith(II, EarliestReplacement); | |||
414 | } | |||
415 | ||||
416 | static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, | |||
417 | IntrinsicInst &II) { | |||
418 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
419 | if (!Pg) | |||
420 | return None; | |||
421 | ||||
422 | if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
423 | return None; | |||
424 | ||||
425 | const auto PTruePattern = | |||
426 | cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); | |||
427 | if (PTruePattern != AArch64SVEPredPattern::vl1) | |||
428 | return None; | |||
429 | ||||
430 | // The intrinsic is inserting into lane zero so use an insert instead. | |||
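// i.e., sve.dup(%vec, ptrue(vl1), %scalar) --> insertelement %vec, %scalar, 0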
431 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
432 | auto *Insert = InsertElementInst::Create( | |||
433 | II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); | |||
434 | Insert->insertBefore(&II); | |||
435 | Insert->takeName(&II); | |||
436 | ||||
437 | return IC.replaceInstUsesWith(II, Insert); | |||
438 | } | |||
439 | ||||
440 | static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, | |||
441 | IntrinsicInst &II) { | |||
442 | LLVMContext &Ctx = II.getContext(); | |||
443 | IRBuilder<> Builder(Ctx); | |||
444 | Builder.SetInsertPoint(&II); | |||
445 | ||||
446 | // Check that the predicate is all active | |||
447 | auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); | |||
448 | if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
449 | return None; | |||
450 | ||||
451 | const auto PTruePattern = | |||
452 | cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); | |||
453 | if (PTruePattern != AArch64SVEPredPattern::all) | |||
454 | return None; | |||
455 | ||||
456 | // Check that we have a compare of zero.. | |||
457 | auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2)); | |||
458 | if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x) | |||
459 | return None; | |||
460 | ||||
461 | auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0)); | |||
462 | if (!DupXArg || !DupXArg->isZero()) | |||
463 | return None; | |||
464 | ||||
465 | // ..against a dupq | |||
466 | auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
467 | if (!DupQLane || | |||
468 | DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) | |||
469 | return None; | |||
470 | ||||
471 | // Where the dupq is a lane 0 replicate of a vector insert | |||
472 | if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) | |||
473 | return None; | |||
474 | ||||
475 | auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); | |||
476 | if (!VecIns || | |||
477 | VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert) | |||
478 | return None; | |||
479 | ||||
480 | // Where the vector insert is a fixed constant vector insert into undef at | |||
481 | // index zero | |||
482 | if (!isa<UndefValue>(VecIns->getArgOperand(0))) | |||
483 | return None; | |||
484 | ||||
485 | if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) | |||
486 | return None; | |||
487 | ||||
488 | auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); | |||
489 | if (!ConstVec) | |||
490 | return None; | |||
491 | ||||
492 | auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); | |||
493 | auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); | |||
494 | if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) | |||
495 | return None; | |||
496 | ||||
497 | unsigned NumElts = VecTy->getNumElements(); | |||
498 | unsigned PredicateBits = 0; | |||
499 | ||||
500 | // Expand intrinsic operands to a 16-bit byte level predicate | |||
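// e.g., with NumElts == 4, element I maps to every fourth bit, so a
// non-zero element sets bit (I * 4).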
501 | for (unsigned I = 0; I < NumElts; ++I) { | |||
502 | auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); | |||
503 | if (!Arg) | |||
504 | return None; | |||
505 | if (!Arg->isZero()) | |||
506 | PredicateBits |= 1 << (I * (16 / NumElts)); | |||
507 | } | |||
508 | ||||
509 | // If all bits are zero bail early with an empty predicate | |||
510 | if (PredicateBits == 0) { | |||
511 | auto *PFalse = Constant::getNullValue(II.getType()); | |||
512 | PFalse->takeName(&II); | |||
513 | return IC.replaceInstUsesWith(II, PFalse); | |||
514 | } | |||
515 | ||||
516 | // Calculate largest predicate type used (where byte predicate is largest) | |||
517 | unsigned Mask = 8; | |||
518 | for (unsigned I = 0; I < 16; ++I) | |||
519 | if ((PredicateBits & (1 << I)) != 0) | |||
520 | Mask |= (I % 8); | |||
521 | ||||
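// Mask & -Mask isolates the lowest set bit of Mask, i.e. the smallest byte
// stride between active predicate bits, which selects the coarsest legal
// predicate element size (1, 2, 4 or 8 bytes).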
522 | unsigned PredSize = Mask & -Mask; | |||
523 | auto *PredType = ScalableVectorType::get( | |||
524 | Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); | |||
525 | ||||
526 | // Ensure all relevant bits are set | |||
527 | for (unsigned I = 0; I < 16; I += PredSize) | |||
528 | if ((PredicateBits & (1 << I)) == 0) | |||
529 | return None; | |||
530 | ||||
531 | auto *PTruePat = | |||
532 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); | |||
533 | auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, | |||
534 | {PredType}, {PTruePat}); | |||
535 | auto *ConvertToSVBool = Builder.CreateIntrinsic( | |||
536 | Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); | |||
537 | auto *ConvertFromSVBool = | |||
538 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, | |||
539 | {II.getType()}, {ConvertToSVBool}); | |||
540 | ||||
541 | ConvertFromSVBool->takeName(&II); | |||
542 | return IC.replaceInstUsesWith(II, ConvertFromSVBool); | |||
543 | } | |||
544 | ||||
545 | static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, | |||
546 | IntrinsicInst &II) { | |||
547 | IRBuilder<> Builder(II.getContext()); | |||
548 | Builder.SetInsertPoint(&II); | |||
549 | Value *Pg = II.getArgOperand(0); | |||
550 | Value *Vec = II.getArgOperand(1); | |||
551 | auto IntrinsicID = II.getIntrinsicID(); | |||
552 | bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; | |||
553 | ||||
554 | // lastX(splat(X)) --> X | |||
555 | if (auto *SplatVal = getSplatValue(Vec)) | |||
556 | return IC.replaceInstUsesWith(II, SplatVal); | |||
557 | ||||
558 | // If x and/or y is a splat value then: | |||
559 | // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) | |||
560 | Value *LHS, *RHS; | |||
561 | if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { | |||
562 | if (isSplatValue(LHS) || isSplatValue(RHS)) { | |||
563 | auto *OldBinOp = cast<BinaryOperator>(Vec); | |||
564 | auto OpC = OldBinOp->getOpcode(); | |||
565 | auto *NewLHS = | |||
566 | Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); | |||
567 | auto *NewRHS = | |||
568 | Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); | |||
569 | auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( | |||
570 | OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); | |||
571 | return IC.replaceInstUsesWith(II, NewBinOp); | |||
572 | } | |||
573 | } | |||
574 | ||||
575 | auto *C = dyn_cast<Constant>(Pg); | |||
576 | if (IsAfter && C && C->isNullValue()) { | |||
577 | // The intrinsic is extracting lane 0 so use an extract instead. | |||
578 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
579 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); | |||
580 | Extract->insertBefore(&II); | |||
581 | Extract->takeName(&II); | |||
582 | return IC.replaceInstUsesWith(II, Extract); | |||
583 | } | |||
584 | ||||
585 | auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); | |||
586 | if (!IntrPG) | |||
587 | return None; | |||
588 | ||||
589 | if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
590 | return None; | |||
591 | ||||
592 | const auto PTruePattern = | |||
593 | cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); | |||
594 | ||||
595 | // Can the intrinsic's predicate be converted to a known constant index? | |||
596 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); | |||
597 | if (!MinNumElts) | |||
598 | return None; | |||
599 | ||||
600 | unsigned Idx = MinNumElts - 1; | |||
601 | // Increment the index if extracting the element after the last active | |||
602 | // predicate element. | |||
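// e.g., lastb with ptrue(vl4) reads lane 3, while lasta reads lane 4.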
603 | if (IsAfter) | |||
604 | ++Idx; | |||
605 | ||||
606 | // Ignore extracts whose index is larger than the known minimum vector | |||
607 | // length. NOTE: This is an artificial constraint where we prefer to | |||
608 | // maintain what the user asked for until an alternative is proven faster. | |||
609 | auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); | |||
610 | if (Idx >= PgVTy->getMinNumElements()) | |||
611 | return None; | |||
612 | ||||
613 | // The intrinsic is extracting a fixed lane so use an extract instead. | |||
614 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
615 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); | |||
616 | Extract->insertBefore(&II); | |||
617 | Extract->takeName(&II); | |||
618 | return IC.replaceInstUsesWith(II, Extract); | |||
619 | } | |||
620 | ||||
621 | static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, | |||
622 | IntrinsicInst &II) { | |||
623 | LLVMContext &Ctx = II.getContext(); | |||
624 | IRBuilder<> Builder(Ctx); | |||
625 | Builder.SetInsertPoint(&II); | |||
626 | // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr | |||
627 | // can work with RDFFR_PP for ptest elimination. | |||
628 | auto *AllPat = | |||
629 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); | |||
630 | auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, | |||
631 | {II.getType()}, {AllPat}); | |||
632 | auto *RDFFR = | |||
633 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); | |||
634 | RDFFR->takeName(&II); | |||
635 | return IC.replaceInstUsesWith(II, RDFFR); | |||
636 | } | |||
637 | ||||
638 | static Optional<Instruction *> | |||
639 | instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { | |||
640 | const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); | |||
641 | ||||
642 | if (Pattern == AArch64SVEPredPattern::all) { | |||
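// e.g., cntd(all) returns vscale * 2, the number of 64-bit elements in a
// scalable vector.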
643 | LLVMContext &Ctx = II.getContext(); | |||
644 | IRBuilder<> Builder(Ctx); | |||
645 | Builder.SetInsertPoint(&II); | |||
646 | ||||
647 | Constant *StepVal = ConstantInt::get(II.getType(), NumElts); | |||
648 | auto *VScale = Builder.CreateVScale(StepVal); | |||
649 | VScale->takeName(&II); | |||
650 | return IC.replaceInstUsesWith(II, VScale); | |||
651 | } | |||
652 | ||||
653 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); | |||
654 | ||||
655 | return MinNumElts && NumElts >= MinNumElts | |||
656 | ? Optional<Instruction *>(IC.replaceInstUsesWith( | |||
657 | II, ConstantInt::get(II.getType(), MinNumElts))) | |||
658 | : None; | |||
659 | } | |||
660 | ||||
661 | static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, | |||
662 | IntrinsicInst &II) { | |||
663 | IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); | |||
664 | IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
665 | ||||
666 | if (Op1 && Op2 && | |||
667 | Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && | |||
668 | Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && | |||
669 | Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { | |||
670 | ||||
671 | IRBuilder<> Builder(II.getContext()); | |||
672 | Builder.SetInsertPoint(&II); | |||
673 | ||||
674 | Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; | |||
675 | Type *Tys[] = {Op1->getArgOperand(0)->getType()}; | |||
676 | ||||
677 | auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); | |||
678 | ||||
679 | PTest->takeName(&II); | |||
680 | return IC.replaceInstUsesWith(II, PTest); | |||
681 | } | |||
682 | ||||
683 | return None; | |||
684 | } | |||
685 | ||||
686 | static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, | |||
687 | IntrinsicInst &II) { | |||
688 | auto *OpPredicate = II.getOperand(0); | |||
689 | auto *OpMultiplicand = II.getOperand(1); | |||
690 | auto *OpMultiplier = II.getOperand(2); | |||
691 | ||||
692 | IRBuilder<> Builder(II.getContext()); | |||
693 | Builder.SetInsertPoint(&II); | |||
694 | ||||
695 | // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call | |||
696 | // with a unit splat value, false otherwise. | |||
697 | auto IsUnitDupX = [](auto *I) { | |||
698 | auto *IntrI = dyn_cast<IntrinsicInst>(I); | |||
699 | if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x) | |||
700 | return false; | |||
701 | ||||
702 | auto *SplatValue = IntrI->getOperand(0); | |||
703 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); | |||
704 | }; | |||
705 | ||||
706 | // Return true if a given instruction is an aarch64_sve_dup intrinsic call | |||
707 | // with a unit splat value, false otherwise. | |||
708 | auto IsUnitDup = [](auto *I) { | |||
709 | auto *IntrI = dyn_cast<IntrinsicInst>(I); | |||
710 | if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) | |||
711 | return false; | |||
712 | ||||
713 | auto *SplatValue = IntrI->getOperand(2); | |||
714 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); | |||
715 | }; | |||
716 | ||||
717 | // The OpMultiplier variable should always point to the dup (if any), so | |||
718 | // swap if necessary. | |||
719 | if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand)) | |||
720 | std::swap(OpMultiplier, OpMultiplicand); | |||
721 | ||||
722 | if (IsUnitDupX(OpMultiplier)) { | |||
723 | // [f]mul pg (dupx 1) %n => %n | |||
724 | OpMultiplicand->takeName(&II); | |||
725 | return IC.replaceInstUsesWith(II, OpMultiplicand); | |||
726 | } else if (IsUnitDup(OpMultiplier)) { | |||
727 | // [f]mul pg (dup pg 1) %n => %n | |||
728 | auto *DupInst = cast<IntrinsicInst>(OpMultiplier); | |||
729 | auto *DupPg = DupInst->getOperand(1); | |||
730 | // TODO: this is naive. The optimization is still valid if DupPg | |||
731 | // 'encompasses' OpPredicate, not only if they're the same predicate. | |||
732 | if (OpPredicate == DupPg) { | |||
733 | OpMultiplicand->takeName(&II); | |||
734 | return IC.replaceInstUsesWith(II, OpMultiplicand); | |||
735 | } | |||
736 | } | |||
737 | ||||
738 | return None; | |||
739 | } | |||
740 | ||||
741 | static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, | |||
742 | IntrinsicInst &II) { | |||
743 | IRBuilder<> Builder(II.getContext()); | |||
744 | Builder.SetInsertPoint(&II); | |||
745 | Value *UnpackArg = II.getArgOperand(0); | |||
746 | auto *RetTy = cast<ScalableVectorType>(II.getType()); | |||
747 | bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || | |||
748 | II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; | |||
749 | ||||
750 | // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) | |||
751 | // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) | |||
752 | if (auto *ScalarArg = getSplatValue(UnpackArg)) { | |||
753 | ScalarArg = | |||
754 | Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); | |||
755 | Value *NewVal = | |||
756 | Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); | |||
757 | NewVal->takeName(&II); | |||
758 | return IC.replaceInstUsesWith(II, NewVal); | |||
759 | } | |||
760 | ||||
761 | return None; | |||
762 | } | |||
763 | static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC, | |||
764 | IntrinsicInst &II) { | |||
765 | auto *OpVal = II.getOperand(0); | |||
766 | auto *OpIndices = II.getOperand(1); | |||
767 | VectorType *VTy = cast<VectorType>(II.getType()); | |||
768 | ||||
769 | // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with | |||
770 | // constant splat value < minimal element count of result. | |||
771 | auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices); | |||
772 | if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x) | |||
773 | return None; | |||
774 | ||||
775 | auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0)); | |||
776 | if (!SplatValue || | |||
777 | SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) | |||
778 | return None; | |||
779 | ||||
780 | // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to | |||
781 | // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. | |||
782 | IRBuilder<> Builder(II.getContext()); | |||
783 | Builder.SetInsertPoint(&II); | |||
784 | auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); | |||
785 | auto *VectorSplat = | |||
786 | Builder.CreateVectorSplat(VTy->getElementCount(), Extract); | |||
787 | ||||
788 | VectorSplat->takeName(&II); | |||
789 | return IC.replaceInstUsesWith(II, VectorSplat); | |||
790 | } | |||
791 | ||||
792 | Optional<Instruction *> | |||
793 | AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, | |||
794 | IntrinsicInst &II) const { | |||
795 | Intrinsic::ID IID = II.getIntrinsicID(); | |||
796 | switch (IID) { | |||
797 | default: | |||
798 | break; | |||
799 | case Intrinsic::aarch64_sve_convert_from_svbool: | |||
800 | return instCombineConvertFromSVBool(IC, II); | |||
801 | case Intrinsic::aarch64_sve_dup: | |||
802 | return instCombineSVEDup(IC, II); | |||
803 | case Intrinsic::aarch64_sve_cmpne: | |||
804 | case Intrinsic::aarch64_sve_cmpne_wide: | |||
805 | return instCombineSVECmpNE(IC, II); | |||
806 | case Intrinsic::aarch64_sve_rdffr: | |||
807 | return instCombineRDFFR(IC, II); | |||
808 | case Intrinsic::aarch64_sve_lasta: | |||
809 | case Intrinsic::aarch64_sve_lastb: | |||
810 | return instCombineSVELast(IC, II); | |||
811 | case Intrinsic::aarch64_sve_cntd: | |||
812 | return instCombineSVECntElts(IC, II, 2); | |||
813 | case Intrinsic::aarch64_sve_cntw: | |||
814 | return instCombineSVECntElts(IC, II, 4); | |||
815 | case Intrinsic::aarch64_sve_cnth: | |||
816 | return instCombineSVECntElts(IC, II, 8); | |||
817 | case Intrinsic::aarch64_sve_cntb: | |||
818 | return instCombineSVECntElts(IC, II, 16); | |||
819 | case Intrinsic::aarch64_sve_ptest_any: | |||
820 | case Intrinsic::aarch64_sve_ptest_first: | |||
821 | case Intrinsic::aarch64_sve_ptest_last: | |||
822 | return instCombineSVEPTest(IC, II); | |||
823 | case Intrinsic::aarch64_sve_mul: | |||
824 | case Intrinsic::aarch64_sve_fmul: | |||
825 | return instCombineSVEVectorMul(IC, II); | |||
826 | case Intrinsic::aarch64_sve_tbl: | |||
827 | return instCombineSVETBL(IC, II); | |||
828 | case Intrinsic::aarch64_sve_uunpkhi: | |||
829 | case Intrinsic::aarch64_sve_uunpklo: | |||
830 | case Intrinsic::aarch64_sve_sunpkhi: | |||
831 | case Intrinsic::aarch64_sve_sunpklo: | |||
832 | return instCombineSVEUnpack(IC, II); | |||
833 | } | |||
834 | ||||
835 | return None; | |||
836 | } | |||
837 | ||||
838 | bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, | |||
839 | ArrayRef<const Value *> Args) { | |||
840 | ||||
841 | // A helper that returns a vector type whose scalar type comes from the
842 | // given type and whose element count is taken from DstTy.
843 | auto toVectorTy = [&](Type *ArgTy) { | |||
844 | return VectorType::get(ArgTy->getScalarType(), | |||
845 | cast<VectorType>(DstTy)->getElementCount()); | |||
846 | }; | |||
847 | ||||
848 | // Exit early if DstTy is not a vector type whose elements are at least | |||
849 | // 16-bits wide. | |||
850 | if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) | |||
851 | return false; | |||
852 | ||||
853 | // Determine if the operation has a widening variant. We consider both the | |||
854 | // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the | |||
855 | // instructions. | |||
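// e.g., add(%a, zext <8 x i8> %b to <8 x i16>) can lower to UADDW, and
// add(zext %a, zext %b) to UADDL, eliminating the extends.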
856 | // | |||
857 | // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we | |||
858 | // verify that their extending operands are eliminated during code | |||
859 | // generation. | |||
860 | switch (Opcode) { | |||
861 | case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). | |||
862 | case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). | |||
863 | break; | |||
864 | default: | |||
865 | return false; | |||
866 | } | |||
867 | ||||
868 | // To be a widening instruction (either the "wide" or "long" versions), the | |||
869 | // second operand must be a sign- or zero extend having a single user. We | |||
870 | // only consider extends having a single user because they may otherwise not | |||
871 | // be eliminated. | |||
872 | if (Args.size() != 2 || | |||
873 | (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || | |||
874 | !Args[1]->hasOneUse()) | |||
875 | return false; | |||
876 | auto *Extend = cast<CastInst>(Args[1]); | |||
877 | ||||
878 | // Legalize the destination type and ensure it can be used in a widening | |||
879 | // operation. | |||
880 | auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); | |||
881 | unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); | |||
882 | if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) | |||
883 | return false; | |||
884 | ||||
885 | // Legalize the source type and ensure it can be used in a widening | |||
886 | // operation. | |||
887 | auto *SrcTy = toVectorTy(Extend->getSrcTy()); | |||
888 | auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); | |||
889 | unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); | |||
890 | if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) | |||
891 | return false; | |||
892 | ||||
893 | // Get the total number of vector elements in the legalized types. | |||
894 | InstructionCost NumDstEls = | |||
895 | DstTyL.first * DstTyL.second.getVectorMinNumElements(); | |||
896 | InstructionCost NumSrcEls = | |||
897 | SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); | |||
898 | ||||
899 | // Return true if the legalized types have the same number of vector elements | |||
900 | // and the destination element type size is twice that of the source type. | |||
901 | return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; | |||
902 | } | |||
903 | ||||
904 | InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | |||
905 | Type *Src, | |||
906 | TTI::CastContextHint CCH, | |||
907 | TTI::TargetCostKind CostKind, | |||
908 | const Instruction *I) { | |||
909 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
910 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp" , 910, __extension__ __PRETTY_FUNCTION__)); | |||
911 | ||||
912 | // If the cast is observable, and it is used by a widening instruction (e.g., | |||
913 | // uaddl, saddw, etc.), it may be free. | |||
914 | if (I && I->hasOneUse()) { | |||
915 | auto *SingleUser = cast<Instruction>(*I->user_begin()); | |||
916 | SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); | |||
917 | if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { | |||
918 | // If the cast is the second operand, it is free. We will generate either | |||
919 | // a "wide" or "long" version of the widening instruction. | |||
920 | if (I == SingleUser->getOperand(1)) | |||
921 | return 0; | |||
922 | // If the cast is not the second operand, it will be free if it looks the | |||
923 | // same as the second operand. In this case, we will generate a "long" | |||
924 | // version of the widening instruction. | |||
925 | if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) | |||
926 | if (I->getOpcode() == unsigned(Cast->getOpcode()) && | |||
927 | cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) | |||
928 | return 0; | |||
929 | } | |||
930 | } | |||
931 | ||||
932 | // TODO: Allow non-throughput costs that aren't binary. | |||
933 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | |||
934 | if (CostKind != TTI::TCK_RecipThroughput) | |||
935 | return Cost == 0 ? 0 : 1; | |||
936 | return Cost; | |||
937 | }; | |||
938 | ||||
939 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
940 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
941 | ||||
942 | if (!SrcTy.isSimple() || !DstTy.isSimple()) | |||
943 | return AdjustCost( | |||
944 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
945 | ||||
946 | static const TypeConversionCostTblEntry | |||
947 | ConversionTbl[] = { | |||
948 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, | |||
949 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, | |||
950 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, | |||
951 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, | |||
952 | ||||
953 | // Truncations on nxvmiN | |||
954 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, | |||
955 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, | |||
956 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, | |||
957 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, | |||
958 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, | |||
959 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, | |||
960 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, | |||
961 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, | |||
962 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, | |||
963 | { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, | |||
964 | { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, | |||
965 | { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, | |||
966 | { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, | |||
967 | { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, | |||
968 | { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, | |||
969 | { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, | |||
970 | ||||
971 | // The number of shll instructions for the extension. | |||
972 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
973 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
974 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
975 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
976 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
977 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
978 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
979 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
980 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
981 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
982 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
983 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
984 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
985 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
986 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
987 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
988 | ||||
989 | // LowerVectorINT_TO_FP: | |||
990 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
991 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
992 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
993 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
994 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
995 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
996 | ||||
997 | // Complex: to v2f32 | |||
998 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
999 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, | |||
1000 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, | |||
1001 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
1002 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, | |||
1003 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, | |||
1004 | ||||
1005 | // Complex: to v4f32 | |||
1006 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, | |||
1007 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
1008 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | |||
1009 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
1010 | ||||
1011 | // Complex: to v8f32 | |||
1012 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, | |||
1013 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
1014 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, | |||
1015 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
1016 | ||||
1017 | // Complex: to v16f32 | |||
1018 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, | |||
1019 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, | |||
1020 | ||||
1021 | // Complex: to v2f64 | |||
1022 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
1023 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, | |||
1024 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
1025 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
1026 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, | |||
1027 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
1028 | ||||
1029 | ||||
1030 | // LowerVectorFP_TO_INT | |||
1031 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
1032 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
1033 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1034 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
1035 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
1036 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1037 | ||||
1038 | // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). | |||
1039 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, | |||
1040 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, | |||
1041 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, | |||
1042 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, | |||
1043 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, | |||
1044 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, | |||
1045 | ||||
1046 | // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 | |||
1047 | { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
1048 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, | |||
1049 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
1050 | { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, | |||
1051 | ||||
1052 | // Complex, from nxv2f32. | |||
1053 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, | |||
1054 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, | |||
1055 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, | |||
1056 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, | |||
1057 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, | |||
1058 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, | |||
1059 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, | |||
1060 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, | |||
1061 | ||||
1062 | // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. | |||
1063 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
1064 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
1065 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, | |||
1066 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
1067 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
1068 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, | |||
1069 | ||||
1070 | // Complex, from nxv2f64. | |||
1071 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, | |||
1072 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, | |||
1073 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, | |||
1074 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, | |||
1075 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, | |||
1076 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, | |||
1077 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, | |||
1078 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, | |||
1079 | ||||
1080 | // Complex, from nxv4f32. | |||
1081 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, | |||
1082 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, | |||
1083 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, | |||
1084 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, | |||
1085 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, | |||
1086 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, | |||
1087 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, | |||
1088 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, | |||
1089 | ||||
1090 | // Complex, from nxv8f64. Illegal -> illegal conversions not required. | |||
1091 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, | |||
1092 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, | |||
1093 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, | |||
1094 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, | |||
1095 | ||||
1096 | // Complex, from nxv4f64. Illegal -> illegal conversions not required. | |||
1097 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, | |||
1098 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, | |||
1099 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, | |||
1100 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, | |||
1101 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, | |||
1102 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, | |||
1103 | ||||
1104 | // Complex, from nxv8f32. Illegal -> illegal conversions not required. | |||
1105 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, | |||
1106 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, | |||
1107 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, | |||
1108 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, | |||
1109 | ||||
1110 | // Complex, from nxv8f16. | |||
1111 | { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, | |||
1112 | { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, | |||
1113 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, | |||
1114 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, | |||
1115 | { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, | |||
1116 | { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, | |||
1117 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, | |||
1118 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, | |||
1119 | ||||
1120 | // Complex, from nxv4f16. | |||
1121 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, | |||
1122 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, | |||
1123 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, | |||
1124 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, | |||
1125 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, | |||
1126 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, | |||
1127 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, | |||
1128 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, | |||
1129 | ||||
1130 | // Complex, from nxv2f16. | |||
1131 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, | |||
1132 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, | |||
1133 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, | |||
1134 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, | |||
1135 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, | |||
1136 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, | |||
1137 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, | |||
1138 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, | |||
1139 | ||||
1140 | // Truncate from nxvmf32 to nxvmf16. | |||
1141 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, | |||
1142 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, | |||
1143 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, | |||
1144 | ||||
1145 | // Truncate from nxvmf64 to nxvmf16. | |||
1146 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, | |||
1147 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, | |||
1148 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, | |||
1149 | ||||
1150 | // Truncate from nxvmf64 to nxvmf32. | |||
1151 | { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, | |||
1152 | { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, | |||
1153 | { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, | |||
1154 | ||||
1155 | // Extend from nxvmf16 to nxvmf32. | |||
1156 | { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, | |||
1157 | { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, | |||
1158 | { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, | |||
1159 | ||||
1160 | // Extend from nxvmf16 to nxvmf64. | |||
1161 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, | |||
1162 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, | |||
1163 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, | |||
1164 | ||||
1165 | // Extend from nxvmf32 to nxvmf64. | |||
1166 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, | |||
1167 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, | |||
1168 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, | |||
1169 | ||||
1170 | }; | |||
1171 | ||||
1172 | if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, | |||
1173 | DstTy.getSimpleVT(), | |||
1174 | SrcTy.getSimpleVT())) | |||
1175 | return AdjustCost(Entry->Cost); | |||
1176 | ||||
1177 | return AdjustCost( | |||
1178 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
1179 | } | |||
1180 | ||||
1181 | InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, | |||
1182 | Type *Dst, | |||
1183 | VectorType *VecTy, | |||
1184 | unsigned Index) { | |||
1185 | ||||
1186 | // Make sure we were given a valid extend opcode. | |||
1187 | assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && | |||
1188 | "Invalid opcode"); | |||
1189 | ||||
1190 | // We are extending an element we extract from a vector, so the source type | |||
1191 | // of the extend is the element type of the vector. | |||
1192 | auto *Src = VecTy->getElementType(); | |||
1193 | ||||
1194 | // Sign- and zero-extends are for integer types only. | |||
1195 | assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); | |||
1196 | ||||
1197 | // Get the cost for the extract. We compute the cost (if any) for the extend | |||
1198 | // below. | |||
1199 | InstructionCost Cost = | |||
1200 | getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); | |||
1201 | ||||
1202 | // Legalize the types. | |||
1203 | auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); | |||
1204 | auto DstVT = TLI->getValueType(DL, Dst); | |||
1205 | auto SrcVT = TLI->getValueType(DL, Src); | |||
1206 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
1207 | ||||
1208 | // If the resulting type is still a vector and the destination type is legal, | |||
1209 | // we may get the extension for free. If not, get the default cost for the | |||
1210 | // extend. | |||
1211 | if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) | |||
1212 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
1213 | CostKind); | |||
1214 | ||||
1215 | // The destination type should be larger than the element type. If not, get | |||
1216 | // the default cost for the extend. | |||
1217 | if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) | |||
1218 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
1219 | CostKind); | |||
1220 | ||||
1221 | switch (Opcode) { | |||
1222 | default: | |||
1223 | llvm_unreachable("Opcode should be either SExt or ZExt"); | |||
1224 | ||||
1225 | // For sign-extends, we only need a smov, which performs the extension | |||
1226 | // automatically. | |||
1227 | case Instruction::SExt: | |||
1228 | return Cost; | |||
1229 | ||||
1230 | // For zero-extends, the extend is performed automatically by a umov unless | |||
1231 | // the destination type is i64 and the element type is i8 or i16. | |||
1232 | case Instruction::ZExt: | |||
1233 | if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) | |||
1234 | return Cost; | |||
1235 | } | |||
1236 | ||||
1237 | // If we are unable to perform the extend for free, get the default cost. | |||
1238 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
1239 | CostKind); | |||
1240 | } | |||
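// Worked example (editor's note): extracting lane 1 of a <4 x i32> and
// zero-extending it to i64 hits the ZExt case above with DstVT == 64 and
// SrcVT == 32, so only the extract is charged -- a umov into a w register
// already zeroes the upper half of the x register. With an i8 or i16 element,
// the zext to i64 falls through and is priced via getCastInstrCost.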
1241 | ||||
1242 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, | |||
1243 | TTI::TargetCostKind CostKind, | |||
1244 | const Instruction *I) { | |||
1245 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1246 | return Opcode == Instruction::PHI ? 0 : 1; | |||
1247 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); | |||
1248 | // Branches are assumed to be predicted. | |||
1249 | return 0; | |||
1250 | } | |||
1251 | ||||
1252 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | |||
1253 | unsigned Index) { | |||
1254 | assert(Val->isVectorTy() && "This must be a vector type"); | |||
1255 | ||||
1256 | if (Index != -1U) { | |||
1257 | // Legalize the type. | |||
1258 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); | |||
1259 | ||||
1260 | // This type is legalized to a scalar type. | |||
1261 | if (!LT.second.isVector()) | |||
1262 | return 0; | |||
1263 | ||||
1264 | // The type may be split. Normalize the index to the new type. | |||
1265 | unsigned Width = LT.second.getVectorNumElements(); | |||
1266 | Index = Index % Width; | |||
1267 | ||||
1268 | // The element at index zero is already inside the vector. | |||
1269 | if (Index == 0) | |||
1270 | return 0; | |||
1271 | } | |||
1272 | ||||
1273 | // All other insert/extracts cost this much. | |||
1274 | return ST->getVectorInsertExtractBaseCost(); | |||
1275 | } | |||
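// Worked example (editor's note): a <8 x i32> extract of element 4 legalizes
// to two v4i32 registers, so the index is normalized to 4 % 4 == 0; the
// element then already sits in lane 0 of the second register and the extract
// is free. Every other lane costs ST->getVectorInsertExtractBaseCost().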
1276 | ||||
1277 | InstructionCost AArch64TTIImpl::getArithmeticInstrCost( | |||
1278 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | |||
1279 | TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, | |||
1280 | TTI::OperandValueProperties Opd1PropInfo, | |||
1281 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, | |||
1282 | const Instruction *CxtI) { | |||
1283 | // TODO: Handle more cost kinds. | |||
1284 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1285 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, | |||
1286 | Opd2Info, Opd1PropInfo, | |||
1287 | Opd2PropInfo, Args, CxtI); | |||
1288 | ||||
1289 | // Legalize the type. | |||
1290 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | |||
1291 | ||||
1292 | // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), | |||
1293 | // add in the widening overhead specified by the sub-target. Since the | |||
1294 | // extends feeding widening instructions are performed automatically, they | |||
1295 | // aren't present in the generated code and have a zero cost. By adding a | |||
1296 | // widening overhead here, we attach the total cost of the combined operation | |||
1297 | // to the widening instruction. | |||
1298 | InstructionCost Cost = 0; | |||
1299 | if (isWideningInstruction(Ty, Opcode, Args)) | |||
1300 | Cost += ST->getWideningBaseCost(); | |||
1301 | ||||
1302 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1303 | ||||
1304 | switch (ISD) { | |||
1305 | default: | |||
1306 | return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, | |||
1307 | Opd2Info, | |||
1308 | Opd1PropInfo, Opd2PropInfo); | |||
1309 | case ISD::SDIV: | |||
1310 | if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && | |||
1311 | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { | |||
1312 | // On AArch64, scalar signed division by a power-of-two constant is | |||
1313 | // normally expanded to the sequence ADD + CMP + SELECT + SRA. | |||
1314 | // The OperandValue properties may not be the same as those of the | |||
1315 | // previous operation; conservatively assume OP_None. | |||
1316 | Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, | |||
1317 | Opd1Info, Opd2Info, | |||
1318 | TargetTransformInfo::OP_None, | |||
1319 | TargetTransformInfo::OP_None); | |||
1320 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, | |||
1321 | Opd1Info, Opd2Info, | |||
1322 | TargetTransformInfo::OP_None, | |||
1323 | TargetTransformInfo::OP_None); | |||
1324 | Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind, | |||
1325 | Opd1Info, Opd2Info, | |||
1326 | TargetTransformInfo::OP_None, | |||
1327 | TargetTransformInfo::OP_None); | |||
1328 | Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | |||
1329 | Opd1Info, Opd2Info, | |||
1330 | TargetTransformInfo::OP_None, | |||
1331 | TargetTransformInfo::OP_None); | |||
1332 | return Cost; | |||
1333 | } | |||
1334 | LLVM_FALLTHROUGH; | |||
1335 | case ISD::UDIV: | |||
1336 | if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) { | |||
1337 | auto VT = TLI->getValueType(DL, Ty); | |||
1338 | if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { | |||
1339 | // Vector signed division by a constant is expanded to the | |||
1340 | // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division | |||
1341 | // to MULHU + SUB + SRL + ADD + SRL. | |||
1342 | InstructionCost MulCost = getArithmeticInstrCost( | |||
1343 | Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info, | |||
1344 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | |||
1345 | InstructionCost AddCost = getArithmeticInstrCost( | |||
1346 | Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, | |||
1347 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | |||
1348 | InstructionCost ShrCost = getArithmeticInstrCost( | |||
1349 | Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info, | |||
1350 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | |||
1351 | return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; | |||
1352 | } | |||
1353 | } | |||
1354 | ||||
1355 | Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, | |||
1356 | Opd2Info, | |||
1357 | Opd1PropInfo, Opd2PropInfo); | |||
1358 | if (Ty->isVectorTy()) { | |||
1359 | // On AArch64, vector divisions are not supported natively and are | |||
1360 | // expanded into scalar divisions of each pair of elements. | |||
1361 | Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind, | |||
1362 | Opd1Info, Opd2Info, Opd1PropInfo, | |||
1363 | Opd2PropInfo); | |||
1364 | Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, | |||
1365 | Opd1Info, Opd2Info, Opd1PropInfo, | |||
1366 | Opd2PropInfo); | |||
1367 | // TODO: if one of the arguments is scalar, then it's not necessary to | |||
1368 | // double the cost of handling the vector elements. | |||
1369 | Cost += Cost; | |||
1370 | } | |||
1371 | return Cost; | |||
1372 | ||||
1373 | case ISD::MUL: | |||
1374 | if (LT.second != MVT::v2i64) | |||
1375 | return (Cost + 1) * LT.first; | |||
1376 | // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive | |||
1377 | // as elements are extracted from the vectors and the muls scalarized. | |||
1378 | // As getScalarizationOverhead is a bit too pessimistic, we estimate the | |||
1379 | // cost for a i64 vector directly here, which is: | |||
1380 | // - four i64 extracts, | |||
1381 | // - two i64 inserts, and | |||
1382 | // - two muls. | |||
1383 | // So, for a v2i64 with LT.first = 1 the cost is 8, and for a v4i64 with | |||
1384 | // LT.first = 2 the cost is 16. | |||
1385 | return LT.first * 8; | |||
1386 | case ISD::ADD: | |||
1387 | case ISD::XOR: | |||
1388 | case ISD::OR: | |||
1389 | case ISD::AND: | |||
1390 | // These nodes are marked as 'custom' for combining purposes only. | |||
1391 | // We know that they are legal. See LowerAdd in ISelLowering. | |||
1392 | return (Cost + 1) * LT.first; | |||
1393 | ||||
1394 | case ISD::FADD: | |||
1395 | // These nodes are marked as 'custom' just to lower them to SVE. | |||
1396 | // We know said lowering will incur no additional cost. | |||
1397 | if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty()) | |||
1398 | return (Cost + 2) * LT.first; | |||
1399 | ||||
1400 | return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, | |||
1401 | Opd2Info, | |||
1402 | Opd1PropInfo, Opd2PropInfo); | |||
1403 | } | |||
1404 | } | |||
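// The ADD + CMP + SELECT + SRA sequence priced in the SDIV case above can be
// written out directly. This is an editor's sketch for int32_t division by 4
// (the helper name is invented for illustration and is not part of this file):
static inline int32_t sdivByPow2Sketch(int32_t X) {
  int32_t Biased = X + 3;           // ADD: bias negative dividends by divisor-1
  int32_t Sel = X < 0 ? Biased : X; // CMP + SELECT (csel on AArch64)
  return Sel >> 2;                  // SRA: arithmetic shift right by log2(4)
}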
1405 | ||||
1406 | InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, | |||
1407 | ScalarEvolution *SE, | |||
1408 | const SCEV *Ptr) { | |||
1409 | // Address computations in vectorized code with non-consecutive addresses will | |||
1410 | // likely result in more instructions compared to scalar code where the | |||
1411 | // computation can more often be merged into the index mode. The resulting | |||
1412 | // extra micro-ops can significantly decrease throughput. | |||
1413 | unsigned NumVectorInstToHideOverhead = 10; | |||
1414 | int MaxMergeDistance = 64; | |||
1415 | ||||
1416 | if (Ty->isVectorTy() && SE && | |||
1417 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) | |||
1418 | return NumVectorInstToHideOverhead; | |||
1419 | ||||
1420 | // In many cases the address computation is not merged into the instruction | |||
1421 | // addressing mode. | |||
1422 | return 1; | |||
1423 | } | |||
1424 | ||||
1425 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, | |||
1426 | Type *CondTy, | |||
1427 | CmpInst::Predicate VecPred, | |||
1428 | TTI::TargetCostKind CostKind, | |||
1429 | const Instruction *I) { | |||
1430 | // TODO: Handle other cost kinds. | |||
1431 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1432 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | |||
1433 | I); | |||
1434 | ||||
1435 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1436 | // We don't lower some vector selects well when they are wider than the | |||
1437 | // register width. | |||
1438 | if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { | |||
1439 | // We would need this many instructions to hide the scalarization happening. | |||
1440 | const int AmortizationCost = 20; | |||
1441 | ||||
1442 | // If VecPred is not set, check if we can get a predicate from the context | |||
1443 | // instruction, if its type matches the requested ValTy. | |||
1444 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { | |||
1445 | CmpInst::Predicate CurrentPred; | |||
1446 | if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), | |||
1447 | m_Value()))) | |||
1448 | VecPred = CurrentPred; | |||
1449 | } | |||
1450 | // Check if we have a compare/select chain that can be lowered using CMxx & | |||
1451 | // BFI pair. | |||
1452 | if (CmpInst::isIntPredicate(VecPred)) { | |||
1453 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
1454 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
1455 | MVT::v2i64}; | |||
1456 | auto LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
1457 | if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; })) | |||
1458 | return LT.first; | |||
1459 | } | |||
1460 | ||||
1461 | static const TypeConversionCostTblEntry | |||
1462 | VectorSelectTbl[] = { | |||
1463 | { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, | |||
1464 | { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, | |||
1465 | { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, | |||
1466 | { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, | |||
1467 | { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, | |||
1468 | { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } | |||
1469 | }; | |||
1470 | ||||
1471 | EVT SelCondTy = TLI->getValueType(DL, CondTy); | |||
1472 | EVT SelValTy = TLI->getValueType(DL, ValTy); | |||
1473 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { | |||
1474 | if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, | |||
1475 | SelCondTy.getSimpleVT(), | |||
1476 | SelValTy.getSimpleVT())) | |||
1477 | return Entry->Cost; | |||
1478 | } | |||
1479 | } | |||
1480 | // The base case handles scalable vectors fine for now, since it treats the | |||
1481 | // cost as 1 * legalization cost. | |||
1482 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); | |||
1483 | } | |||
1484 | ||||
1485 | AArch64TTIImpl::TTI::MemCmpExpansionOptions | |||
1486 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | |||
1487 | TTI::MemCmpExpansionOptions Options; | |||
1488 | if (ST->requiresStrictAlign()) { | |||
1489 | // TODO: Add cost modeling for strict align. Misaligned loads expand to | |||
1490 | // a bunch of instructions when strict align is enabled. | |||
1491 | return Options; | |||
1492 | } | |||
1493 | Options.AllowOverlappingLoads = true; | |||
1494 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | |||
1495 | Options.NumLoadsPerBlock = Options.MaxNumLoads; | |||
1496 | // TODO: Though vector loads usually perform well on AArch64, in some targets | |||
1497 | // they may wake up the FP unit, which raises the power consumption. Perhaps | |||
1498 | // they could be used with no holds barred (-O3). | |||
1499 | Options.LoadSizes = {8, 4, 2, 1}; | |||
1500 | return Options; | |||
1501 | } | |||
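// AllowOverlappingLoads above lets a compare whose length is not an exact sum
// of LoadSizes round up and overlap rather than tail off with smaller loads.
// An editor's sketch of what the expansion amounts to for a 15-byte equality
// compare (the helper is invented for illustration; assumes <cstdint> and
// <cstring> are available):
static inline bool memcmpEq15Sketch(const char *A, const char *B) {
  uint64_t A0, B0, A1, B1;
  memcpy(&A0, A, 8);     // bytes 0..7
  memcpy(&B0, B, 8);
  memcpy(&A1, A + 7, 8); // bytes 7..14, deliberately overlapping byte 7
  memcpy(&B1, B + 7, 8);
  return A0 == B0 && A1 == B1; // two 8-byte compares instead of 8+4+2+1
}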
1502 | ||||
1503 | InstructionCost | |||
1504 | AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, | |||
1505 | Align Alignment, unsigned AddressSpace, | |||
1506 | TTI::TargetCostKind CostKind) { | |||
1507 | if (useNeonVector(Src)) | |||
1508 | return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
1509 | CostKind); | |||
1510 | auto LT = TLI->getTypeLegalizationCost(DL, Src); | |||
1511 | if (!LT.first.isValid()) | |||
1512 | return InstructionCost::getInvalid(); | |||
1513 | ||||
1514 | // The code-generator is currently not able to handle scalable vectors | |||
1515 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
1516 | // it. This change will be removed when code-generation for these types is | |||
1517 | // sufficiently reliable. | |||
1518 | if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) | |||
1519 | return InstructionCost::getInvalid(); | |||
1520 | ||||
1521 | return LT.first * 2; | |||
1522 | } | |||
1523 | ||||
1524 | InstructionCost AArch64TTIImpl::getGatherScatterOpCost( | |||
1525 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, | |||
1526 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { | |||
1527 | if (useNeonVector(DataTy)) | |||
1528 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, | |||
1529 | Alignment, CostKind, I); | |||
1530 | auto *VT = cast<VectorType>(DataTy); | |||
1531 | auto LT = TLI->getTypeLegalizationCost(DL, DataTy); | |||
1532 | if (!LT.first.isValid()) | |||
1533 | return InstructionCost::getInvalid(); | |||
1534 | ||||
1535 | // The code-generator is currently not able to handle scalable vectors | |||
1536 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
1537 | // it. This change will be removed when code-generation for these types is | |||
1538 | // sufficiently reliable. | |||
1539 | if (cast<VectorType>(DataTy)->getElementCount() == | |||
1540 | ElementCount::getScalable(1)) | |||
1541 | return InstructionCost::getInvalid(); | |||
1542 | ||||
1543 | ElementCount LegalVF = LT.second.getVectorElementCount(); | |||
1544 | InstructionCost MemOpCost = | |||
1545 | getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); | |||
1546 | return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction()); | |||
1547 | } | |||
1548 | ||||
1549 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { | |||
1550 | return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); | |||
1551 | } | |||
1552 | ||||
1553 | InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, | |||
1554 | MaybeAlign Alignment, | |||
1555 | unsigned AddressSpace, | |||
1556 | TTI::TargetCostKind CostKind, | |||
1557 | const Instruction *I) { | |||
1558 | EVT VT = TLI->getValueType(DL, Ty, true); | |||
1559 | // Type legalization can't handle structs | |||
1560 | if (VT == MVT::Other) | |||
1561 | return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, | |||
1562 | CostKind); | |||
1563 | ||||
1564 | auto LT = TLI->getTypeLegalizationCost(DL, Ty); | |||
1565 | if (!LT.first.isValid()) | |||
1566 | return InstructionCost::getInvalid(); | |||
1567 | ||||
1568 | // The code-generator is currently not able to handle scalable vectors | |||
1569 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
1570 | // it. This change will be removed when code-generation for these types is | |||
1571 | // sufficiently reliable. | |||
1572 | if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) | |||
1573 | if (VTy->getElementCount() == ElementCount::getScalable(1)) | |||
1574 | return InstructionCost::getInvalid(); | |||
1575 | ||||
1576 | // TODO: consider latency as well for TCK_SizeAndLatency. | |||
1577 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) | |||
1578 | return LT.first; | |||
1579 | ||||
1580 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1581 | return 1; | |||
1582 | ||||
1583 | if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && | |||
1584 | LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { | |||
1585 | // Unaligned stores are extremely inefficient. We don't split all | |||
1586 | // unaligned 128-bit stores because of the negative impact that doing so | |||
1587 | // has shown in practice on inlined block copy code. | |||
1588 | // We make such stores expensive so that we will only vectorize if there | |||
1589 | // are 6 other instructions getting vectorized. | |||
1590 | const int AmortizationCost = 6; | |||
1591 | ||||
1592 | return LT.first * 2 * AmortizationCost; | |||
1593 | } | |||
1594 | ||||
1595 | // Check truncating stores and extending loads. | |||
1596 | if (useNeonVector(Ty) && | |||
1597 | Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { | |||
1598 | // v4i8 types are lowered to a scalar load/store and sshll/xtn. | |||
1599 | if (VT == MVT::v4i8) | |||
1600 | return 2; | |||
1601 | // Otherwise we need to scalarize. | |||
1602 | return cast<FixedVectorType>(Ty)->getNumElements() * 2; | |||
1603 | } | |||
1604 | ||||
1605 | return LT.first; | |||
1606 | } | |||
1607 | ||||
1608 | InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( | |||
1609 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, | |||
1610 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | |||
1611 | bool UseMaskForCond, bool UseMaskForGaps) { | |||
1612 | assert(Factor >= 2 && "Invalid interleave factor"); | |||
1613 | auto *VecVTy = cast<FixedVectorType>(VecTy); | |||
1614 | ||||
1615 | if (!UseMaskForCond && !UseMaskForGaps && | |||
1616 | Factor <= TLI->getMaxSupportedInterleaveFactor()) { | |||
1617 | unsigned NumElts = VecVTy->getNumElements(); | |||
1618 | auto *SubVecTy = | |||
1619 | FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); | |||
1620 | ||||
1621 | // ldN/stN only support legal vector types that are 64 or 128 bits in size. | |||
1622 | // Accesses having vector types that are a multiple of 128 bits can be | |||
1623 | // matched to more than one ldN/stN instruction. | |||
1624 | if (NumElts % Factor == 0 && | |||
1625 | TLI->isLegalInterleavedAccessType(SubVecTy, DL)) | |||
1626 | return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); | |||
1627 | } | |||
1628 | ||||
1629 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
1630 | Alignment, AddressSpace, CostKind, | |||
1631 | UseMaskForCond, UseMaskForGaps); | |||
1632 | } | |||
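// Worked example (editor's note): an ld2 of <8 x i32> with Factor == 2 gives
// SubVecTy == <4 x i32>, a legal 128-bit type, so the cost is
// Factor * getNumInterleavedAccesses(<4 x i32>) == 2 * 1 == 2. A <16 x i32>
// access at the same factor presumably splits into two ld2 pairs and costs 4.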
1633 | ||||
1634 | InstructionCost | |||
1635 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { | |||
1636 | InstructionCost Cost = 0; | |||
1637 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
1638 | for (auto *I : Tys) { | |||
1639 | if (!I->isVectorTy()) | |||
1640 | continue; | |||
1641 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == | |||
1642 | 128) | |||
1643 | Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + | |||
1644 | getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); | |||
1645 | } | |||
1646 | return Cost; | |||
1647 | } | |||
1648 | ||||
1649 | unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { | |||
1650 | return ST->getMaxInterleaveFactor(); | |||
1651 | } | |||
1652 | ||||
1653 | // For Falkor, we want to avoid having too many strided loads in a loop since | |||
1654 | // that can exhaust the HW prefetcher resources. We adjust the unroller | |||
1655 | // MaxCount preference below to attempt to ensure unrolling doesn't create too | |||
1656 | // many strided loads. | |||
1657 | static void | |||
1658 | getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, | |||
1659 | TargetTransformInfo::UnrollingPreferences &UP) { | |||
1660 | enum { MaxStridedLoads = 7 }; | |||
1661 | auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { | |||
1662 | int StridedLoads = 0; | |||
1663 | // FIXME? We could make this more precise by looking at the CFG and | |||
1664 | // e.g. not counting loads in each side of an if-then-else diamond. | |||
1665 | for (const auto BB : L->blocks()) { | |||
1666 | for (auto &I : *BB) { | |||
1667 | LoadInst *LMemI = dyn_cast<LoadInst>(&I); | |||
1668 | if (!LMemI) | |||
1669 | continue; | |||
1670 | ||||
1671 | Value *PtrValue = LMemI->getPointerOperand(); | |||
1672 | if (L->isLoopInvariant(PtrValue)) | |||
1673 | continue; | |||
1674 | ||||
1675 | const SCEV *LSCEV = SE.getSCEV(PtrValue); | |||
1676 | const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); | |||
1677 | if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) | |||
1678 | continue; | |||
1679 | ||||
1680 | // FIXME? We could take pairing of unrolled load copies into account | |||
1681 | // by looking at the AddRec, but we would probably have to limit this | |||
1682 | // to loops with no stores or other memory optimization barriers. | |||
1683 | ++StridedLoads; | |||
1684 | // We've seen enough strided loads that seeing more won't make a | |||
1685 | // difference. | |||
1686 | if (StridedLoads > MaxStridedLoads / 2) | |||
1687 | return StridedLoads; | |||
1688 | } | |||
1689 | } | |||
1690 | return StridedLoads; | |||
1691 | }; | |||
1692 | ||||
1693 | int StridedLoads = countStridedLoads(L, SE); | |||
1694 | LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads | |||
1695 | << " strided loads\n"); | |||
1696 | // Pick the largest power of 2 unroll count that won't result in too many | |||
1697 | // strided loads. | |||
1698 | if (StridedLoads) { | |||
1699 | UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); | |||
| ||||
1700 | LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " | |||
1701 | << UP.MaxCount << '\n'); | |||
1702 | } | |||
1703 | } | |||
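// Editor's note on the shift in line 1699 above: Log2_32 is defined in
// MathExtras.h as 31 - countLeadingZeros(Value), and countLeadingZeros(0)
// returns 32, so Log2_32(0) wraps around to UINT_MAX and the shift count
// would vastly exceed the width of 'int'. On this path countStridedLoads()
// returns at most MaxStridedLoads / 2 + 1 == 4, so the quotient
// MaxStridedLoads / StridedLoads is at least 1 and the bad shift appears
// unreachable in practice; a defensive form (an editor's sketch, not the
// upstream fix) would make that invariant explicit:
//
//   if (unsigned Quot = MaxStridedLoads / StridedLoads)
//     UP.MaxCount = 1u << Log2_32(Quot);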
1704 | ||||
1705 | void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, | |||
1706 | TTI::UnrollingPreferences &UP, | |||
1707 | OptimizationRemarkEmitter *ORE) { | |||
1708 | // Enable partial unrolling and runtime unrolling. | |||
1709 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); | |||
1710 | ||||
1711 | UP.UpperBound = true; | |||
1712 | ||||
1713 | // An inner loop is more likely to be hot, and its runtime check can be | |||
1714 | // hoisted out by the LICM pass, so the overhead is lower; try a larger | |||
1715 | // threshold to unroll more loops. | |||
1716 | if (L->getLoopDepth() > 1) | |||
| ||||
1717 | UP.PartialThreshold *= 2; | |||
1718 | ||||
1719 | // Disable partial & runtime unrolling on -Os. | |||
1720 | UP.PartialOptSizeThreshold = 0; | |||
1721 | ||||
1722 | if (ST->getProcFamily() == AArch64Subtarget::Falkor && | |||
1723 | EnableFalkorHWPFUnrollFix) | |||
1724 | getFalkorUnrollingPreferences(L, SE, UP); | |||
1725 | ||||
1726 | // Scan the loop: don't unroll loops with calls as this could prevent | |||
1727 | // inlining. Don't unroll vector loops either, as they don't benefit much from | |||
1728 | // unrolling. | |||
1729 | for (auto *BB : L->getBlocks()) { | |||
1730 | for (auto &I : *BB) { | |||
1731 | // Don't unroll vectorised loops. | |||
1732 | if (I.getType()->isVectorTy()) | |||
1733 | return; | |||
1734 | ||||
1735 | if (isa<CallInst>(I) || isa<InvokeInst>(I)) { | |||
1736 | if (const Function *F = cast<CallBase>(I).getCalledFunction()) { | |||
1737 | if (!isLoweredToCall(F)) | |||
1738 | continue; | |||
1739 | } | |||
1740 | return; | |||
1741 | } | |||
1742 | } | |||
1743 | } | |||
1744 | ||||
1745 | // Enable runtime unrolling for in-order models. | |||
1746 | // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so | |||
1747 | // by checking for that case we can ensure that the default behaviour is | |||
1748 | // unchanged. | |||
1749 | if (ST->getProcFamily() != AArch64Subtarget::Others && | |||
1750 | !ST->getSchedModel().isOutOfOrder()) { | |||
1751 | UP.Runtime = true; | |||
1752 | UP.Partial = true; | |||
1753 | UP.UnrollRemainder = true; | |||
1754 | UP.DefaultUnrollRuntimeCount = 4; | |||
1755 | ||||
1756 | UP.UnrollAndJam = true; | |||
1757 | UP.UnrollAndJamInnerLoopThreshold = 60; | |||
1758 | } | |||
1759 | } | |||
1760 | ||||
1761 | void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, | |||
1762 | TTI::PeelingPreferences &PP) { | |||
1763 | BaseT::getPeelingPreferences(L, SE, PP); | |||
1764 | } | |||
1765 | ||||
1766 | Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, | |||
1767 | Type *ExpectedType) { | |||
1768 | switch (Inst->getIntrinsicID()) { | |||
1769 | default: | |||
1770 | return nullptr; | |||
1771 | case Intrinsic::aarch64_neon_st2: | |||
1772 | case Intrinsic::aarch64_neon_st3: | |||
1773 | case Intrinsic::aarch64_neon_st4: { | |||
1774 | // Create a struct type | |||
1775 | StructType *ST = dyn_cast<StructType>(ExpectedType); | |||
1776 | if (!ST) | |||
1777 | return nullptr; | |||
1778 | unsigned NumElts = Inst->getNumArgOperands() - 1; | |||
1779 | if (ST->getNumElements() != NumElts) | |||
1780 | return nullptr; | |||
1781 | for (unsigned i = 0, e = NumElts; i != e; ++i) { | |||
1782 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) | |||
1783 | return nullptr; | |||
1784 | } | |||
1785 | Value *Res = UndefValue::get(ExpectedType); | |||
1786 | IRBuilder<> Builder(Inst); | |||
1787 | for (unsigned i = 0, e = NumElts; i != e; ++i) { | |||
1788 | Value *L = Inst->getArgOperand(i); | |||
1789 | Res = Builder.CreateInsertValue(Res, L, i); | |||
1790 | } | |||
1791 | return Res; | |||
1792 | } | |||
1793 | case Intrinsic::aarch64_neon_ld2: | |||
1794 | case Intrinsic::aarch64_neon_ld3: | |||
1795 | case Intrinsic::aarch64_neon_ld4: | |||
1796 | if (Inst->getType() == ExpectedType) | |||
1797 | return Inst; | |||
1798 | return nullptr; | |||
1799 | } | |||
1800 | } | |||
1801 | ||||
1802 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, | |||
1803 | MemIntrinsicInfo &Info) { | |||
1804 | switch (Inst->getIntrinsicID()) { | |||
1805 | default: | |||
1806 | break; | |||
1807 | case Intrinsic::aarch64_neon_ld2: | |||
1808 | case Intrinsic::aarch64_neon_ld3: | |||
1809 | case Intrinsic::aarch64_neon_ld4: | |||
1810 | Info.ReadMem = true; | |||
1811 | Info.WriteMem = false; | |||
1812 | Info.PtrVal = Inst->getArgOperand(0); | |||
1813 | break; | |||
1814 | case Intrinsic::aarch64_neon_st2: | |||
1815 | case Intrinsic::aarch64_neon_st3: | |||
1816 | case Intrinsic::aarch64_neon_st4: | |||
1817 | Info.ReadMem = false; | |||
1818 | Info.WriteMem = true; | |||
1819 | Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); | |||
1820 | break; | |||
1821 | } | |||
1822 | ||||
1823 | switch (Inst->getIntrinsicID()) { | |||
1824 | default: | |||
1825 | return false; | |||
1826 | case Intrinsic::aarch64_neon_ld2: | |||
1827 | case Intrinsic::aarch64_neon_st2: | |||
1828 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; | |||
1829 | break; | |||
1830 | case Intrinsic::aarch64_neon_ld3: | |||
1831 | case Intrinsic::aarch64_neon_st3: | |||
1832 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; | |||
1833 | break; | |||
1834 | case Intrinsic::aarch64_neon_ld4: | |||
1835 | case Intrinsic::aarch64_neon_st4: | |||
1836 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; | |||
1837 | break; | |||
1838 | } | |||
1839 | return true; | |||
1840 | } | |||
1841 | ||||
1842 | /// See if \p I should be considered for address type promotion. We check if | |||
1843 | /// \p I is a sext with the right type that is used in memory accesses. If it | |||
1844 | /// is used in a "complex" getelementptr, we allow it to be promoted without | |||
1845 | /// finding other sext instructions that sign extended the same initial value. | |||
1846 | /// A getelementptr is considered "complex" if it has more than 2 operands. | |||
1847 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( | |||
1848 | const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { | |||
1849 | bool Considerable = false; | |||
1850 | AllowPromotionWithoutCommonHeader = false; | |||
1851 | if (!isa<SExtInst>(&I)) | |||
1852 | return false; | |||
1853 | Type *ConsideredSExtType = | |||
1854 | Type::getInt64Ty(I.getParent()->getParent()->getContext()); | |||
1855 | if (I.getType() != ConsideredSExtType) | |||
1856 | return false; | |||
1857 | // See if the sext is the one with the right type and used in at least one | |||
1858 | // GetElementPtrInst. | |||
1859 | for (const User *U : I.users()) { | |||
1860 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { | |||
1861 | Considerable = true; | |||
1862 | // A getelementptr is considered as "complex" if it has more than 2 | |||
1863 | // operands. We will promote a SExt used in such complex GEP as we | |||
1864 | // expect some computation to be merged if they are done on 64 bits. | |||
1865 | if (GEPInst->getNumOperands() > 2) { | |||
1866 | AllowPromotionWithoutCommonHeader = true; | |||
1867 | break; | |||
1868 | } | |||
1869 | } | |||
1870 | } | |||
1871 | return Considerable; | |||
1872 | } | |||
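// Example (editor's note): for "getelementptr [10 x i32], ptr %p, i64 %sext,
// i64 %j" the GEP has three operands, so the sext feeding it may be promoted
// without finding a common header; a flat "getelementptr i32, ptr %p,
// i64 %sext" has only two operands and gets no such allowance.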
1873 | ||||
1874 | bool AArch64TTIImpl::isLegalToVectorizeReduction( | |||
1875 | const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { | |||
1876 | if (!VF.isScalable()) | |||
1877 | return true; | |||
1878 | ||||
1879 | Type *Ty = RdxDesc.getRecurrenceType(); | |||
1880 | if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) | |||
1881 | return false; | |||
1882 | ||||
1883 | switch (RdxDesc.getRecurrenceKind()) { | |||
1884 | case RecurKind::Add: | |||
1885 | case RecurKind::FAdd: | |||
1886 | case RecurKind::And: | |||
1887 | case RecurKind::Or: | |||
1888 | case RecurKind::Xor: | |||
1889 | case RecurKind::SMin: | |||
1890 | case RecurKind::SMax: | |||
1891 | case RecurKind::UMin: | |||
1892 | case RecurKind::UMax: | |||
1893 | case RecurKind::FMin: | |||
1894 | case RecurKind::FMax: | |||
1895 | return true; | |||
1896 | default: | |||
1897 | return false; | |||
1898 | } | |||
1899 | } | |||
1900 | ||||
1901 | InstructionCost | |||
1902 | AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, | |||
1903 | bool IsUnsigned, | |||
1904 | TTI::TargetCostKind CostKind) { | |||
1905 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | |||
1906 | ||||
1907 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) | |||
1908 | return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); | |||
1909 | ||||
1910 | assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && | |||
1911 | "Both vectors need to be equally scalable"); | |||
1912 | ||||
1913 | InstructionCost LegalizationCost = 0; | |||
1914 | if (LT.first > 1) { | |||
1915 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); | |||
1916 | unsigned MinMaxOpcode = | |||
1917 | Ty->isFPOrFPVectorTy() | |||
1918 | ? Intrinsic::maxnum | |||
1919 | : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin); | |||
1920 | IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); | |||
1921 | LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); | |||
1922 | } | |||
1923 | ||||
1924 | return LegalizationCost + /*Cost of horizontal reduction*/ 2; | |||
1925 | } | |||
1926 | ||||
1927 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( | |||
1928 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { | |||
1929 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
1930 | InstructionCost LegalizationCost = 0; | |||
1931 | if (LT.first > 1) { | |||
1932 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); | |||
1933 | LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); | |||
1934 | LegalizationCost *= LT.first - 1; | |||
1935 | } | |||
1936 | ||||
1937 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1938 | assert(ISD && "Invalid opcode"); | |||
1939 | // Add the final reduction cost for the legal horizontal reduction | |||
1940 | switch (ISD) { | |||
1941 | case ISD::ADD: | |||
1942 | case ISD::AND: | |||
1943 | case ISD::OR: | |||
1944 | case ISD::XOR: | |||
1945 | case ISD::FADD: | |||
1946 | return LegalizationCost + 2; | |||
1947 | default: | |||
1948 | return InstructionCost::getInvalid(); | |||
1949 | } | |||
1950 | } | |||
1951 | ||||
1952 | InstructionCost | |||
1953 | AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | |||
1954 | Optional<FastMathFlags> FMF, | |||
1955 | TTI::TargetCostKind CostKind) { | |||
1956 | if (TTI::requiresOrderedReduction(FMF)) { | |||
1957 | if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { | |||
1958 | InstructionCost BaseCost = | |||
1959 | BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
1960 | // Add on extra cost to reflect the extra overhead on some CPUs. We still | |||
1961 | // end up vectorizing for more computationally intensive loops. | |||
1962 | return BaseCost + FixedVTy->getNumElements(); | |||
1963 | } | |||
1964 | ||||
1965 | if (Opcode != Instruction::FAdd) | |||
1966 | return InstructionCost::getInvalid(); | |||
1967 | ||||
1968 | auto *VTy = cast<ScalableVectorType>(ValTy); | |||
1969 | InstructionCost Cost = | |||
1970 | getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); | |||
1971 | Cost *= getMaxNumElements(VTy->getElementCount()); | |||
1972 | return Cost; | |||
1973 | } | |||
1974 | ||||
1975 | if (isa<ScalableVectorType>(ValTy)) | |||
1976 | return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); | |||
1977 | ||||
1978 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
1979 | MVT MTy = LT.second; | |||
1980 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1981 | assert(ISD && "Invalid opcode"); | |||
1982 | ||||
1983 | // Horizontal adds can use the 'addv' instruction. We model the cost of these | |||
1984 | // instructions as twice a normal vector add, plus 1 for each legalization | |||
1985 | // step (LT.first). This is the only arithmetic vector reduction operation for | |||
1986 | // which we have an instruction. | |||
1987 | // OR, XOR and AND costs should match the codegen from: | |||
1988 | // OR: llvm/test/CodeGen/AArch64/reduce-or.ll | |||
1989 | // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll | |||
1990 | // AND: llvm/test/CodeGen/AArch64/reduce-and.ll | |||
1991 | static const CostTblEntry CostTblNoPairwise[]{ | |||
1992 | {ISD::ADD, MVT::v8i8, 2}, | |||
1993 | {ISD::ADD, MVT::v16i8, 2}, | |||
1994 | {ISD::ADD, MVT::v4i16, 2}, | |||
1995 | {ISD::ADD, MVT::v8i16, 2}, | |||
1996 | {ISD::ADD, MVT::v4i32, 2}, | |||
1997 | {ISD::OR, MVT::v8i8, 15}, | |||
1998 | {ISD::OR, MVT::v16i8, 17}, | |||
1999 | {ISD::OR, MVT::v4i16, 7}, | |||
2000 | {ISD::OR, MVT::v8i16, 9}, | |||
2001 | {ISD::OR, MVT::v2i32, 3}, | |||
2002 | {ISD::OR, MVT::v4i32, 5}, | |||
2003 | {ISD::OR, MVT::v2i64, 3}, | |||
2004 | {ISD::XOR, MVT::v8i8, 15}, | |||
2005 | {ISD::XOR, MVT::v16i8, 17}, | |||
2006 | {ISD::XOR, MVT::v4i16, 7}, | |||
2007 | {ISD::XOR, MVT::v8i16, 9}, | |||
2008 | {ISD::XOR, MVT::v2i32, 3}, | |||
2009 | {ISD::XOR, MVT::v4i32, 5}, | |||
2010 | {ISD::XOR, MVT::v2i64, 3}, | |||
2011 | {ISD::AND, MVT::v8i8, 15}, | |||
2012 | {ISD::AND, MVT::v16i8, 17}, | |||
2013 | {ISD::AND, MVT::v4i16, 7}, | |||
2014 | {ISD::AND, MVT::v8i16, 9}, | |||
2015 | {ISD::AND, MVT::v2i32, 3}, | |||
2016 | {ISD::AND, MVT::v4i32, 5}, | |||
2017 | {ISD::AND, MVT::v2i64, 3}, | |||
2018 | }; | |||
2019 | switch (ISD) { | |||
2020 | default: | |||
2021 | break; | |||
2022 | case ISD::ADD: | |||
2023 | if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) | |||
2024 | return (LT.first - 1) + Entry->Cost; | |||
2025 | break; | |||
2026 | case ISD::XOR: | |||
2027 | case ISD::AND: | |||
2028 | case ISD::OR: | |||
2029 | const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); | |||
2030 | if (!Entry) | |||
2031 | break; | |||
2032 | auto *ValVTy = cast<FixedVectorType>(ValTy); | |||
2033 | if (!ValVTy->getElementType()->isIntegerTy(1) && | |||
2034 | MTy.getVectorNumElements() <= ValVTy->getNumElements() && | |||
2035 | isPowerOf2_32(ValVTy->getNumElements())) { | |||
2036 | InstructionCost ExtraCost = 0; | |||
2037 | if (LT.first != 1) { | |||
2038 | // Type needs to be split, so there is an extra cost of LT.first - 1 | |||
2039 | // arithmetic ops. | |||
2040 | auto *Ty = FixedVectorType::get(ValTy->getElementType(), | |||
2041 | MTy.getVectorNumElements()); | |||
2042 | ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
2043 | ExtraCost *= LT.first - 1; | |||
2044 | } | |||
2045 | return Entry->Cost + ExtraCost; | |||
2046 | } | |||
2047 | break; | |||
2048 | } | |||
2049 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
2050 | } | |||
2051 | ||||
2052 | InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { | |||
2053 | static const CostTblEntry ShuffleTbl[] = { | |||
2054 | { TTI::SK_Splice, MVT::nxv16i8, 1 }, | |||
2055 | { TTI::SK_Splice, MVT::nxv8i16, 1 }, | |||
2056 | { TTI::SK_Splice, MVT::nxv4i32, 1 }, | |||
2057 | { TTI::SK_Splice, MVT::nxv2i64, 1 }, | |||
2058 | { TTI::SK_Splice, MVT::nxv2f16, 1 }, | |||
2059 | { TTI::SK_Splice, MVT::nxv4f16, 1 }, | |||
2060 | { TTI::SK_Splice, MVT::nxv8f16, 1 }, | |||
2061 | { TTI::SK_Splice, MVT::nxv2bf16, 1 }, | |||
2062 | { TTI::SK_Splice, MVT::nxv4bf16, 1 }, | |||
2063 | { TTI::SK_Splice, MVT::nxv8bf16, 1 }, | |||
2064 | { TTI::SK_Splice, MVT::nxv2f32, 1 }, | |||
2065 | { TTI::SK_Splice, MVT::nxv4f32, 1 }, | |||
2066 | { TTI::SK_Splice, MVT::nxv2f64, 1 }, | |||
2067 | }; | |||
2068 | ||||
2069 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); | |||
2070 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); | |||
2071 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
2072 | EVT PromotedVT = LT.second.getScalarType() == MVT::i1 | |||
2073 | ? TLI->getPromotedVTForPredicate(EVT(LT.second)) | |||
2074 | : LT.second; | |||
2075 | Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); | |||
2076 | InstructionCost LegalizationCost = 0; | |||
2077 | if (Index < 0) { | |||
2078 | LegalizationCost = | |||
2079 | getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, | |||
2080 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + | |||
2081 | getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, | |||
2082 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
2083 | } | |||
2084 | ||||
2085 | // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp. | |||
2086 | // The cost is computed on the promoted type. | |||
2087 | if (LT.second.getScalarType() == MVT::i1) { | |||
2088 | LegalizationCost += | |||
2089 | getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, | |||
2090 | TTI::CastContextHint::None, CostKind) + | |||
2091 | getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, | |||
2092 | TTI::CastContextHint::None, CostKind); | |||
2093 | } | |||
2094 | const auto *Entry = | |||
2095 | CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); | |||
2096 | assert(Entry && "Illegal Type for Splice"); | |||
2097 | LegalizationCost += Entry->Cost; | |||
2098 | return LegalizationCost * LT.first; | |||
2099 | } | |||
2100 | ||||
2101 | InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
2102 | VectorType *Tp, | |||
2103 | ArrayRef<int> Mask, int Index, | |||
2104 | VectorType *SubTp) { | |||
2105 | Kind = improveShuffleKindFromMask(Kind, Mask); | |||
2106 | if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || | |||
2107 | Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || | |||
2108 | Kind == TTI::SK_Reverse) { | |||
2109 | static const CostTblEntry ShuffleTbl[] = { | |||
2110 | // Broadcast shuffle kinds can be performed with 'dup'. | |||
2111 | { TTI::SK_Broadcast, MVT::v8i8, 1 }, | |||
2112 | { TTI::SK_Broadcast, MVT::v16i8, 1 }, | |||
2113 | { TTI::SK_Broadcast, MVT::v4i16, 1 }, | |||
2114 | { TTI::SK_Broadcast, MVT::v8i16, 1 }, | |||
2115 | { TTI::SK_Broadcast, MVT::v2i32, 1 }, | |||
2116 | { TTI::SK_Broadcast, MVT::v4i32, 1 }, | |||
2117 | { TTI::SK_Broadcast, MVT::v2i64, 1 }, | |||
2118 | { TTI::SK_Broadcast, MVT::v2f32, 1 }, | |||
2119 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, | |||
2120 | { TTI::SK_Broadcast, MVT::v2f64, 1 }, | |||
2121 | // Transpose shuffle kinds can be performed with 'trn1/trn2' and | |||
2122 | // 'zip1/zip2' instructions. | |||
2123 | { TTI::SK_Transpose, MVT::v8i8, 1 }, | |||
2124 | { TTI::SK_Transpose, MVT::v16i8, 1 }, | |||
2125 | { TTI::SK_Transpose, MVT::v4i16, 1 }, | |||
2126 | { TTI::SK_Transpose, MVT::v8i16, 1 }, | |||
2127 | { TTI::SK_Transpose, MVT::v2i32, 1 }, | |||
2128 | { TTI::SK_Transpose, MVT::v4i32, 1 }, | |||
2129 | { TTI::SK_Transpose, MVT::v2i64, 1 }, | |||
2130 | { TTI::SK_Transpose, MVT::v2f32, 1 }, | |||
2131 | { TTI::SK_Transpose, MVT::v4f32, 1 }, | |||
2132 | { TTI::SK_Transpose, MVT::v2f64, 1 }, | |||
2133 | // Select shuffle kinds. | |||
2134 | // TODO: handle vXi8/vXi16. | |||
2135 | { TTI::SK_Select, MVT::v2i32, 1 }, // mov. | |||
2136 | { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). | |||
2137 | { TTI::SK_Select, MVT::v2i64, 1 }, // mov. | |||
2138 | { TTI::SK_Select, MVT::v2f32, 1 }, // mov. | |||
2139 | { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). | |||
2140 | { TTI::SK_Select, MVT::v2f64, 1 }, // mov. | |||
2141 | // PermuteSingleSrc shuffle kinds. | |||
2142 | { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. | |||
2143 | { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. | |||
2144 | { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. | |||
2145 | { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. | |||
2146 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. | |||
2147 | { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov. | |||
2148 | { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case. | |||
2149 | { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case. | |||
2150 | { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case. | |||
2151 | { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl | |||
2152 | { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl | |||
2153 | { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl | |||
2154 | { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl | |||
2155 | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl | |||
2156 | // Reverse can be lowered with `rev`. | |||
2157 | { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov. | |||
2158 | { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT | |||
2159 | { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov. | |||
2160 | { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov. | |||
2161 | { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT | |||
2162 | { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov. | |||
2163 | // Broadcast shuffle kinds for scalable vectors | |||
2164 | { TTI::SK_Broadcast, MVT::nxv16i8, 1 }, | |||
2165 | { TTI::SK_Broadcast, MVT::nxv8i16, 1 }, | |||
2166 | { TTI::SK_Broadcast, MVT::nxv4i32, 1 }, | |||
2167 | { TTI::SK_Broadcast, MVT::nxv2i64, 1 }, | |||
2168 | { TTI::SK_Broadcast, MVT::nxv2f16, 1 }, | |||
2169 | { TTI::SK_Broadcast, MVT::nxv4f16, 1 }, | |||
2170 | { TTI::SK_Broadcast, MVT::nxv8f16, 1 }, | |||
2171 | { TTI::SK_Broadcast, MVT::nxv2bf16, 1 }, | |||
2172 | { TTI::SK_Broadcast, MVT::nxv4bf16, 1 }, | |||
2173 | { TTI::SK_Broadcast, MVT::nxv8bf16, 1 }, | |||
2174 | { TTI::SK_Broadcast, MVT::nxv2f32, 1 }, | |||
2175 | { TTI::SK_Broadcast, MVT::nxv4f32, 1 }, | |||
2176 | { TTI::SK_Broadcast, MVT::nxv2f64, 1 }, | |||
2177 | { TTI::SK_Broadcast, MVT::nxv16i1, 1 }, | |||
2178 | { TTI::SK_Broadcast, MVT::nxv8i1, 1 }, | |||
2179 | { TTI::SK_Broadcast, MVT::nxv4i1, 1 }, | |||
2180 | { TTI::SK_Broadcast, MVT::nxv2i1, 1 }, | |||
2181 | // Handle the cases for vector.reverse with scalable vectors | |||
2182 | { TTI::SK_Reverse, MVT::nxv16i8, 1 }, | |||
2183 | { TTI::SK_Reverse, MVT::nxv8i16, 1 }, | |||
2184 | { TTI::SK_Reverse, MVT::nxv4i32, 1 }, | |||
2185 | { TTI::SK_Reverse, MVT::nxv2i64, 1 }, | |||
2186 | { TTI::SK_Reverse, MVT::nxv2f16, 1 }, | |||
2187 | { TTI::SK_Reverse, MVT::nxv4f16, 1 }, | |||
2188 | { TTI::SK_Reverse, MVT::nxv8f16, 1 }, | |||
2189 | { TTI::SK_Reverse, MVT::nxv2bf16, 1 }, | |||
2190 | { TTI::SK_Reverse, MVT::nxv4bf16, 1 }, | |||
2191 | { TTI::SK_Reverse, MVT::nxv8bf16, 1 }, | |||
2192 | { TTI::SK_Reverse, MVT::nxv2f32, 1 }, | |||
2193 | { TTI::SK_Reverse, MVT::nxv4f32, 1 }, | |||
2194 | { TTI::SK_Reverse, MVT::nxv2f64, 1 }, | |||
2195 | { TTI::SK_Reverse, MVT::nxv16i1, 1 }, | |||
2196 | { TTI::SK_Reverse, MVT::nxv8i1, 1 }, | |||
2197 | { TTI::SK_Reverse, MVT::nxv4i1, 1 }, | |||
2198 | { TTI::SK_Reverse, MVT::nxv2i1, 1 }, | |||
2199 | }; | |||
2200 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); | |||
2201 | if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) | |||
2202 | return LT.first * Entry->Cost; | |||
2203 | } | |||
2204 | if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) | |||
2205 | return getSpliceCost(Tp, Index); | |||
2206 | return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); | |||
2207 | } |
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file contains some functions that are useful for math stuff. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H |
14 | #define LLVM_SUPPORT_MATHEXTRAS_H |
15 | |
16 | #include "llvm/Support/Compiler.h" |
17 | #include <cassert> |
18 | #include <climits> |
19 | #include <cmath> |
20 | #include <cstdint> |
21 | #include <cstring> |
22 | #include <limits> |
23 | #include <type_traits> |
24 | |
25 | #ifdef __ANDROID_NDK__ |
26 | #include <android/api-level.h> |
27 | #endif |
28 | |
29 | #ifdef _MSC_VER |
30 | // Declare these intrinsics manually rather including intrin.h. It's very |
31 | // expensive, and MathExtras.h is popular. |
32 | // #include <intrin.h> |
33 | extern "C" { |
34 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); |
35 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); |
36 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); |
37 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); |
38 | } |
39 | #endif |
40 | |
41 | namespace llvm { |
42 | |
43 | /// The behavior an operation has on an input of 0. |
44 | enum ZeroBehavior { |
45 | /// The returned value is undefined. |
46 | ZB_Undefined, |
47 | /// The returned value is numeric_limits<T>::max() |
48 | ZB_Max, |
49 | /// The returned value is numeric_limits<T>::digits |
50 | ZB_Width |
51 | }; |
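// Example (editor's note): countTrailingZeros<uint32_t>(0, ZB_Width) returns
// 32 (std::numeric_limits<uint32_t>::digits), whereas passing ZB_Undefined
// states that the caller guarantees a nonzero input and the result for 0 is
// unspecified -- the distinction that matters when a caller ends up
// evaluating something like Log2_32(0).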
52 | |
53 | /// Mathematical constants. |
54 | namespace numbers { |
55 | // TODO: Track C++20 std::numbers. |
56 | // TODO: Favor using the hexadecimal FP constants (requires C++17). |
57 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 |
58 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 |
59 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 |
60 | ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392 |
61 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) |
62 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) |
63 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 |
64 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 |
65 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 |
66 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 |
67 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) |
69 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 |
70 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) |
71 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 |
72 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 |
73 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 |
74 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 |
75 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 |
76 | log2ef = 1.44269504F, // (0x1.715476P+0) |
77 | log10ef = .434294482F, // (0x1.bcb7b2P-2) |
78 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 |
79 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 |
80 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 |
81 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 |
82 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 |
83 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) |
84 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 |
85 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) |
86 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 |
87 | } // namespace numbers |
88 | |
89 | namespace detail { |
90 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { |
91 | static unsigned count(T Val, ZeroBehavior) { |
92 | if (!Val) |
93 | return std::numeric_limits<T>::digits; |
94 | if (Val & 0x1) |
95 | return 0; |
96 | |
97 | // Bisection method. |
98 | unsigned ZeroBits = 0; |
99 | T Shift = std::numeric_limits<T>::digits >> 1; |
100 | T Mask = std::numeric_limits<T>::max() >> Shift; |
101 | while (Shift) { |
102 | if ((Val & Mask) == 0) { |
103 | Val >>= Shift; |
104 | ZeroBits |= Shift; |
105 | } |
106 | Shift >>= 1; |
107 | Mask >>= Shift; |
108 | } |
109 | return ZeroBits; |
110 | } |
111 | }; |
112 | |
113 | #if defined(__GNUC__) || defined(_MSC_VER)
114 | template <typename T> struct TrailingZerosCounter<T, 4> { |
115 | static unsigned count(T Val, ZeroBehavior ZB) { |
116 | if (ZB != ZB_Undefined && Val == 0) |
117 | return 32; |
118 | |
119 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 | return __builtin_ctz(Val); |
121 | #elif defined(_MSC_VER) |
122 | unsigned long Index; |
123 | _BitScanForward(&Index, Val); |
124 | return Index; |
125 | #endif |
126 | } |
127 | }; |
128 | |
129 | #if !defined(_MSC_VER) || defined(_M_X64) |
130 | template <typename T> struct TrailingZerosCounter<T, 8> { |
131 | static unsigned count(T Val, ZeroBehavior ZB) { |
132 | if (ZB != ZB_Undefined && Val == 0) |
133 | return 64; |
134 | |
135 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 | return __builtin_ctzll(Val); |
137 | #elif defined(_MSC_VER) |
138 | unsigned long Index; |
139 | _BitScanForward64(&Index, Val); |
140 | return Index; |
141 | #endif |
142 | } |
143 | }; |
144 | #endif |
145 | #endif |
146 | } // namespace detail |
147 | |
148 | /// Count the number of 0s from the least significant bit to the most
149 | /// significant bit, stopping at the first 1.
150 | /// |
151 | /// Only unsigned integral types are allowed. |
152 | /// |
153 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are |
154 | /// valid arguments. |
155 | template <typename T> |
156 | unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { |
157 | static_assert(std::numeric_limits<T>::is_integer && |
158 | !std::numeric_limits<T>::is_signed, |
159 | "Only unsigned integral types are allowed."); |
160 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB); |
161 | } |
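// Worked examples (illustrative editorial sketch, not normative):
//   countTrailingZeros(0x10u)            == 4   (0b10000)
//   countTrailingZeros(0u)               == 32  (default ZB_Width)
//   countTrailingZeros(0u, ZB_Undefined) is undefined.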
162 | |
163 | namespace detail { |
164 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { |
165 | static unsigned count(T Val, ZeroBehavior) { |
166 | if (!Val) |
167 | return std::numeric_limits<T>::digits; |
168 | |
169 | // Bisection method. |
170 | unsigned ZeroBits = 0; |
171 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { |
172 | T Tmp = Val >> Shift; |
173 | if (Tmp) |
174 | Val = Tmp; |
175 | else |
176 | ZeroBits |= Shift; |
177 | } |
178 | return ZeroBits; |
179 | } |
180 | }; |
181 | |
182 | #if defined(__GNUC__) || defined(_MSC_VER)
183 | template <typename T> struct LeadingZerosCounter<T, 4> { |
184 | static unsigned count(T Val, ZeroBehavior ZB) { |
185 | if (ZB != ZB_Undefined && Val == 0) |
186 | return 32; |
187 | |
188 | #if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 | return __builtin_clz(Val); |
190 | #elif defined(_MSC_VER) |
191 | unsigned long Index; |
192 | _BitScanReverse(&Index, Val); |
193 | return Index ^ 31; |
194 | #endif |
195 | } |
196 | }; |
197 | |
198 | #if !defined(_MSC_VER) || defined(_M_X64) |
199 | template <typename T> struct LeadingZerosCounter<T, 8> { |
200 | static unsigned count(T Val, ZeroBehavior ZB) { |
201 | if (ZB != ZB_Undefined && Val == 0) |
202 | return 64; |
203 | |
204 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 | return __builtin_clzll(Val); |
206 | #elif defined(_MSC_VER) |
207 | unsigned long Index; |
208 | _BitScanReverse64(&Index, Val); |
209 | return Index ^ 63; |
210 | #endif |
211 | } |
212 | }; |
213 | #endif |
214 | #endif |
215 | } // namespace detail |
216 | |
217 | /// Count the number of 0s from the most significant bit to the least
218 | /// significant bit, stopping at the first 1.
219 | /// |
220 | /// Only unsigned integral types are allowed. |
221 | /// |
222 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are |
223 | /// valid arguments. |
224 | template <typename T> |
225 | unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { |
226 | static_assert(std::numeric_limits<T>::is_integer && |
227 | !std::numeric_limits<T>::is_signed, |
228 | "Only unsigned integral types are allowed."); |
229 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB); |
230 | } |
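// Worked examples (illustrative sketch): countLeadingZeros(1u) == 31,
// countLeadingZeros(0x80000000u) == 0, and countLeadingZeros(0u) == 32
// under the default ZB_Width behavior.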
231 | |
232 | /// Get the index of the first set bit starting from the least |
233 | /// significant bit. |
234 | /// |
235 | /// Only unsigned integral types are allowed. |
236 | /// |
237 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are |
238 | /// valid arguments. |
239 | template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { |
240 | if (ZB == ZB_Max && Val == 0) |
241 | return std::numeric_limits<T>::max(); |
242 | |
243 | return countTrailingZeros(Val, ZB_Undefined); |
244 | } |
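// Worked example (illustrative sketch): findFirstSet(0x18u) == 3, the index
// of the lowest set bit; findFirstSet(0u) == UINT32_MAX under ZB_Max.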
245 | |
246 | /// Create a bitmask with the N right-most bits set to 1, and all other |
247 | /// bits set to 0. Only unsigned types are allowed. |
248 | template <typename T> T maskTrailingOnes(unsigned N) { |
249 | static_assert(std::is_unsigned<T>::value, "Invalid type!"); |
250 | const unsigned Bits = CHAR_BIT * sizeof(T);
251 | assert(N <= Bits && "Invalid bit index");
252 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); |
253 | } |
254 | |
255 | /// Create a bitmask with the N left-most bits set to 1, and all other |
256 | /// bits set to 0. Only unsigned types are allowed. |
257 | template <typename T> T maskLeadingOnes(unsigned N) { |
258 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259 | } |
260 | |
261 | /// Create a bitmask with the N right-most bits set to 0, and all other |
262 | /// bits set to 1. Only unsigned types are allowed. |
263 | template <typename T> T maskTrailingZeros(unsigned N) { |
264 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265 | } |
266 | |
267 | /// Create a bitmask with the N left-most bits set to 0, and all other |
268 | /// bits set to 1. Only unsigned types are allowed. |
269 | template <typename T> T maskLeadingZeros(unsigned N) { |
270 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271 | } |
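// Worked examples (illustrative sketch, shown for T = uint8_t):
//   maskTrailingOnes<uint8_t>(3)  == 0x07 (0b00000111)
//   maskLeadingOnes<uint8_t>(3)   == 0xE0 (0b11100000)
//   maskTrailingZeros<uint8_t>(3) == 0xF8 (0b11111000)
//   maskLeadingZeros<uint8_t>(3)  == 0x1F (0b00011111)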
272 | |
273 | /// Get the index of the last set bit starting from the least |
274 | /// significant bit. |
275 | /// |
276 | /// Only unsigned integral types are allowed. |
277 | /// |
278 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are |
279 | /// valid arguments. |
280 | template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { |
281 | if (ZB == ZB_Max && Val == 0) |
282 | return std::numeric_limits<T>::max(); |
283 | |
284 | // Use ^ instead of - because both gcc and llvm can remove the associated ^ |
285 | // in the __builtin_clz intrinsic on x86. |
286 | return countLeadingZeros(Val, ZB_Undefined) ^ |
287 | (std::numeric_limits<T>::digits - 1); |
288 | } |
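// Worked example (illustrative sketch): findLastSet(0x18u) == 4, since
// countLeadingZeros(0x18u) == 27 and 27 ^ 31 == 4.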
289 | |
290 | /// Macro compressed bit reversal table for 256 bits. |
291 | /// |
292 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable |
293 | static const unsigned char BitReverseTable256[256] = { |
294 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 |
295 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) |
296 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) |
297 | R6(0), R6(2), R6(1), R6(3) |
298 | #undef R2 |
299 | #undef R4 |
300 | #undef R6 |
301 | }; |
302 | |
303 | /// Reverse the bits in \p Val. |
304 | template <typename T> |
305 | T reverseBits(T Val) { |
306 | unsigned char in[sizeof(Val)]; |
307 | unsigned char out[sizeof(Val)]; |
308 | std::memcpy(in, &Val, sizeof(Val)); |
309 | for (unsigned i = 0; i < sizeof(Val); ++i) |
310 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; |
311 | std::memcpy(&Val, out, sizeof(Val)); |
312 | return Val; |
313 | } |
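// Worked examples (illustrative sketch): reverseBits<uint8_t>(0x01) == 0x80
// and reverseBits<uint16_t>(0x00FF) == 0xFF00; the table reverses each byte
// while the loop reverses the byte order.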
314 | |
315 | #if __has_builtin(__builtin_bitreverse8)
316 | template<> |
317 | inline uint8_t reverseBits<uint8_t>(uint8_t Val) { |
318 | return __builtin_bitreverse8(Val); |
319 | } |
320 | #endif |
321 | |
322 | #if __has_builtin(__builtin_bitreverse16)
323 | template<> |
324 | inline uint16_t reverseBits<uint16_t>(uint16_t Val) { |
325 | return __builtin_bitreverse16(Val); |
326 | } |
327 | #endif |
328 | |
329 | #if __has_builtin(__builtin_bitreverse32)
330 | template<> |
331 | inline uint32_t reverseBits<uint32_t>(uint32_t Val) { |
332 | return __builtin_bitreverse32(Val); |
333 | } |
334 | #endif |
335 | |
336 | #if __has_builtin(__builtin_bitreverse64)
337 | template<> |
338 | inline uint64_t reverseBits<uint64_t>(uint64_t Val) { |
339 | return __builtin_bitreverse64(Val); |
340 | } |
341 | #endif |
342 | |
343 | // NOTE: The following support functions use the _32/_64 extensions instead of |
344 | // type overloading so that signed and unsigned integers can be used without |
345 | // ambiguity. |
346 | |
347 | /// Return the high 32 bits of a 64 bit value. |
348 | constexpr inline uint32_t Hi_32(uint64_t Value) { |
349 | return static_cast<uint32_t>(Value >> 32); |
350 | } |
351 | |
352 | /// Return the low 32 bits of a 64 bit value. |
353 | constexpr inline uint32_t Lo_32(uint64_t Value) { |
354 | return static_cast<uint32_t>(Value); |
355 | } |
356 | |
357 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. |
358 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { |
359 | return ((uint64_t)High << 32) | (uint64_t)Low; |
360 | } |
361 | |
362 | /// Checks if an integer fits into the given bit width. |
363 | template <unsigned N> constexpr inline bool isInt(int64_t x) { |
364 | return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365 | } |
366 | // Template specializations to get better code for common cases. |
367 | template <> constexpr inline bool isInt<8>(int64_t x) { |
368 | return static_cast<int8_t>(x) == x; |
369 | } |
370 | template <> constexpr inline bool isInt<16>(int64_t x) { |
371 | return static_cast<int16_t>(x) == x; |
372 | } |
373 | template <> constexpr inline bool isInt<32>(int64_t x) { |
374 | return static_cast<int32_t>(x) == x; |
375 | } |
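// Illustrative compile-time checks (editorial sketch; assumes the usual
// two's-complement narrowing conversion):
static_assert(isInt<8>(127) && !isInt<8>(128), "int8_t range");
static_assert(isInt<16>(-32768) && !isInt<16>(32768), "int16_t range");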
376 | |
377 | /// Checks if a signed integer is an N bit number shifted left by S. |
378 | template <unsigned N, unsigned S> |
379 | constexpr inline bool isShiftedInt(int64_t x) { |
380 | static_assert( |
381 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); |
382 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); |
383 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384 | } |
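// Worked example (illustrative sketch): isShiftedInt<8, 2>(x) accepts
// multiples of 4 in [-512, 508]; isShiftedInt<8, 2>(508) is true, while
// isShiftedInt<8, 2>(510) is false (not a multiple of 1 << 2).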
385 | |
386 | /// Checks if an unsigned integer fits into the given bit width. |
387 | /// |
388 | /// This is written as two functions rather than as simply |
389 | /// |
390 | /// return N >= 64 || X < (UINT64_C(1) << N); |
391 | /// |
392 | /// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting |
393 | /// left too many places. |
394 | template <unsigned N> |
395 | constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) { |
396 | static_assert(N > 0, "isUInt<0> doesn't make sense"); |
397 | return X < (UINT64_C(1) << (N));
398 | } |
399 | template <unsigned N> |
400 | constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) { |
401 | return true; |
402 | } |
403 | |
404 | // Template specializations to get better code for common cases. |
405 | template <> constexpr inline bool isUInt<8>(uint64_t x) { |
406 | return static_cast<uint8_t>(x) == x; |
407 | } |
408 | template <> constexpr inline bool isUInt<16>(uint64_t x) { |
409 | return static_cast<uint16_t>(x) == x; |
410 | } |
411 | template <> constexpr inline bool isUInt<32>(uint64_t x) { |
412 | return static_cast<uint32_t>(x) == x; |
413 | } |
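// Illustrative compile-time checks (editorial sketch):
static_assert(isUInt<8>(255) && !isUInt<8>(256), "uint8_t range");
static_assert(isUInt<64>(UINT64_MAX), "every uint64_t fits in 64 bits");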
414 | |
415 | /// Checks if an unsigned integer is an N bit number shifted left by S.
416 | template <unsigned N, unsigned S> |
417 | constexpr inline bool isShiftedUInt(uint64_t x) { |
418 | static_assert( |
419 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); |
420 | static_assert(N + S <= 64, |
421 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); |
422 | // Per the two static_asserts above, S must be strictly less than 64. So |
423 | // 1 << S is not undefined behavior. |
424 | return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425 | } |
426 | |
427 | /// Gets the maximum value for a N-bit unsigned integer. |
428 | inline uint64_t maxUIntN(uint64_t N) { |
429 | assert(N > 0 && N <= 64 && "integer width out of range");
430 | |
431 | // uint64_t(1) << 64 is undefined behavior, so we can't do |
432 | // (uint64_t(1) << N) - 1 |
433 | // without checking first that N != 64. But this works and doesn't have a |
434 | // branch. |
435 | return UINT64_MAX >> (64 - N);
436 | } |
437 | |
438 | /// Gets the minimum value for a N-bit signed integer. |
439 | inline int64_t minIntN(int64_t N) { |
440 | assert(N > 0 && N <= 64 && "integer width out of range");
441 | |
442 | return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443 | } |
444 | |
445 | /// Gets the maximum value for a N-bit signed integer. |
446 | inline int64_t maxIntN(int64_t N) { |
447 | assert(N > 0 && N <= 64 && "integer width out of range");
448 | |
449 | // This relies on two's complement wraparound when N == 64, so we convert to |
450 | // int64_t only at the very end to avoid UB. |
451 | return (UINT64_C(1) << (N - 1)) - 1;
452 | } |
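// Worked examples (illustrative sketch): maxUIntN(8) == 255,
// minIntN(8) == -128, maxIntN(8) == 127; maxUIntN(64) == UINT64_MAX without
// evaluating the undefined shift (1 << 64) discussed above.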
453 | |
454 | /// Checks if an unsigned integer fits into the given (dynamic) bit width. |
455 | inline bool isUIntN(unsigned N, uint64_t x) { |
456 | return N >= 64 || x <= maxUIntN(N); |
457 | } |
458 | |
459 | /// Checks if a signed integer fits into the given (dynamic) bit width.
460 | inline bool isIntN(unsigned N, int64_t x) { |
461 | return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); |
462 | } |
463 | |
464 | /// Return true if the argument is a non-empty sequence of ones starting at the |
465 | /// least significant bit with the remainder zero (32 bit version). |
466 | /// Ex. isMask_32(0x0000FFFFU) == true. |
467 | constexpr inline bool isMask_32(uint32_t Value) { |
468 | return Value && ((Value + 1) & Value) == 0; |
469 | } |
470 | |
471 | /// Return true if the argument is a non-empty sequence of ones starting at the |
472 | /// least significant bit with the remainder zero (64 bit version). |
473 | constexpr inline bool isMask_64(uint64_t Value) { |
474 | return Value && ((Value + 1) & Value) == 0; |
475 | } |
476 | |
477 | /// Return true if the argument contains a non-empty sequence of ones with the |
478 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. |
479 | constexpr inline bool isShiftedMask_32(uint32_t Value) { |
480 | return Value && isMask_32((Value - 1) | Value); |
481 | } |
482 | |
483 | /// Return true if the argument contains a non-empty sequence of ones with the |
484 | /// remainder zero (64 bit version.) |
485 | constexpr inline bool isShiftedMask_64(uint64_t Value) { |
486 | return Value && isMask_64((Value - 1) | Value); |
487 | } |
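// Illustrative compile-time checks (editorial sketch):
static_assert(isMask_32(0x0000FFFF), "contiguous low ones");
static_assert(isShiftedMask_32(0x0000FF00), "contiguous ones, shifted");
static_assert(!isShiftedMask_32(0x00FF00FF), "ones are not contiguous");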
488 | |
489 | /// Return true if the argument is a power of two > 0. |
490 | /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) |
491 | constexpr inline bool isPowerOf2_32(uint32_t Value) { |
492 | return Value && !(Value & (Value - 1)); |
493 | } |
494 | |
495 | /// Return true if the argument is a power of two > 0 (64 bit edition.) |
496 | constexpr inline bool isPowerOf2_64(uint64_t Value) { |
497 | return Value && !(Value & (Value - 1)); |
498 | } |
499 | |
500 | /// Count the number of ones from the most significant bit to the first |
501 | /// zero bit. |
502 | /// |
503 | /// Ex. countLeadingOnes(0xFF0FFF00) == 8. |
504 | /// Only unsigned integral types are allowed. |
505 | /// |
506 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and |
507 | /// ZB_Undefined are valid arguments. |
508 | template <typename T> |
509 | unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { |
510 | static_assert(std::numeric_limits<T>::is_integer && |
511 | !std::numeric_limits<T>::is_signed, |
512 | "Only unsigned integral types are allowed."); |
513 | return countLeadingZeros<T>(~Value, ZB); |
514 | } |
515 | |
516 | /// Count the number of ones from the least significant bit to the first |
517 | /// zero bit. |
518 | /// |
519 | /// Ex. countTrailingOnes(0x00FF00FF) == 8. |
520 | /// Only unsigned integral types are allowed. |
521 | /// |
522 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and |
523 | /// ZB_Undefined are valid arguments. |
524 | template <typename T> |
525 | unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { |
526 | static_assert(std::numeric_limits<T>::is_integer && |
527 | !std::numeric_limits<T>::is_signed, |
528 | "Only unsigned integral types are allowed."); |
529 | return countTrailingZeros<T>(~Value, ZB); |
530 | } |
531 | |
532 | namespace detail { |
533 | template <typename T, std::size_t SizeOfT> struct PopulationCounter { |
534 | static unsigned count(T Value) { |
535 | // Generic version, forward to 32 bits. |
536 | static_assert(SizeOfT <= 4, "Not implemented!"); |
537 | #if defined(__GNUC__)
538 | return __builtin_popcount(Value); |
539 | #else |
540 | uint32_t v = Value; |
541 | v = v - ((v >> 1) & 0x55555555); |
542 | v = (v & 0x33333333) + ((v >> 2) & 0x33333333); |
543 | return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; |
544 | #endif |
545 | } |
546 | }; |
547 | |
548 | template <typename T> struct PopulationCounter<T, 8> { |
549 | static unsigned count(T Value) { |
550 | #if defined(__GNUC__)
551 | return __builtin_popcountll(Value); |
552 | #else |
553 | uint64_t v = Value; |
554 | v = v - ((v >> 1) & 0x5555555555555555ULL); |
555 | v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); |
556 | v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; |
557 | return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); |
558 | #endif |
559 | } |
560 | }; |
561 | } // namespace detail |
562 | |
563 | /// Count the number of set bits in a value. |
564 | /// Ex. countPopulation(0xF000F000) = 8 |
565 | /// Returns 0 if the word is zero. |
566 | template <typename T> |
567 | inline unsigned countPopulation(T Value) { |
568 | static_assert(std::numeric_limits<T>::is_integer && |
569 | !std::numeric_limits<T>::is_signed, |
570 | "Only unsigned integral types are allowed."); |
571 | return detail::PopulationCounter<T, sizeof(T)>::count(Value); |
572 | } |
573 | |
574 | /// Compile time Log2. |
575 | /// Valid only for positive powers of two. |
576 | template <size_t kValue> constexpr inline size_t CTLog2() { |
577 | static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), |
578 | "Value is not a valid power of 2"); |
579 | return 1 + CTLog2<kValue / 2>(); |
580 | } |
581 | |
582 | template <> constexpr inline size_t CTLog2<1>() { return 0; } |
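// Illustrative compile-time check (editorial sketch):
static_assert(CTLog2<64>() == 6, "2^6 == 64");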
583 | |
584 | /// Return the log base 2 of the specified value. |
585 | inline double Log2(double Value) { |
586 | #if defined(__ANDROID_API__) && __ANDROID_API__ < 18 |
587 | return __builtin_log(Value) / __builtin_log(2.0); |
588 | #else |
589 | return log2(Value); |
590 | #endif |
591 | } |
592 | |
593 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. |
594 | /// (32 bit edition.) |
595 | /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 |
596 | inline unsigned Log2_32(uint32_t Value) { |
597 | return 31 - countLeadingZeros(Value); |
598 | } |
599 | |
600 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. |
601 | /// (64 bit edition.) |
602 | inline unsigned Log2_64(uint64_t Value) { |
603 | return 63 - countLeadingZeros(Value); |
604 | } |
605 | |
606 | /// Return the ceil log base 2 of the specified value, 32 if the value is zero. |
607 | /// (32 bit edition). |
608 | /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 |
609 | inline unsigned Log2_32_Ceil(uint32_t Value) { |
610 | return 32 - countLeadingZeros(Value - 1); |
611 | } |
612 | |
613 | /// Return the ceil log base 2 of the specified value, 64 if the value is zero. |
614 | /// (64 bit edition.) |
615 | inline unsigned Log2_64_Ceil(uint64_t Value) { |
616 | return 64 - countLeadingZeros(Value - 1); |
617 | } |
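// Worked examples (illustrative sketch): Log2_32(32) == 5, Log2_32(33) == 5,
// Log2_32_Ceil(33) == 6; Log2_32(0) computes 31 - 32, which wraps to
// 0xFFFFFFFF in the unsigned return type (the "-1" mentioned above).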
618 | |
619 | /// Return the greatest common divisor of the values using Euclid's algorithm. |
620 | template <typename T> |
621 | inline T greatestCommonDivisor(T A, T B) { |
622 | while (B) { |
623 | T Tmp = B; |
624 | B = A % B; |
625 | A = Tmp; |
626 | } |
627 | return A; |
628 | } |
629 | |
630 | inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { |
631 | return greatestCommonDivisor<uint64_t>(A, B); |
632 | } |
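// Worked example (illustrative sketch): greatestCommonDivisor(12u, 18u) == 6.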
633 | |
634 | /// This function takes a 64-bit integer and returns the bit equivalent double. |
635 | inline double BitsToDouble(uint64_t Bits) { |
636 | double D; |
637 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); |
638 | memcpy(&D, &Bits, sizeof(Bits)); |
639 | return D; |
640 | } |
641 | |
642 | /// This function takes a 32-bit integer and returns the bit equivalent float. |
643 | inline float BitsToFloat(uint32_t Bits) { |
644 | float F; |
645 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); |
646 | memcpy(&F, &Bits, sizeof(Bits)); |
647 | return F; |
648 | } |
649 | |
650 | /// This function takes a double and returns the bit equivalent 64-bit integer. |
651 | /// Note that copying doubles around changes the bits of NaNs on some hosts, |
652 | /// notably x86, so this routine cannot be used if these bits are needed. |
653 | inline uint64_t DoubleToBits(double Double) { |
654 | uint64_t Bits; |
655 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); |
656 | memcpy(&Bits, &Double, sizeof(Double)); |
657 | return Bits; |
658 | } |
659 | |
660 | /// This function takes a float and returns the bit equivalent 32-bit integer. |
661 | /// Note that copying floats around changes the bits of NaNs on some hosts, |
662 | /// notably x86, so this routine cannot be used if these bits are needed. |
663 | inline uint32_t FloatToBits(float Float) { |
664 | uint32_t Bits; |
665 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); |
666 | memcpy(&Bits, &Float, sizeof(Float)); |
667 | return Bits; |
668 | } |
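// Worked round trip (illustrative sketch, IEEE-754 single precision):
// FloatToBits(1.0f) == 0x3F800000 and BitsToFloat(0x3F800000) == 1.0f.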
669 | |
670 | /// A and B are either alignments or offsets. Return the minimum alignment that |
671 | /// may be assumed after adding the two together. |
672 | constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { |
673 | // The largest power of 2 that divides both A and B. |
674 | // |
675 | // Replace "-Value" by "1+~Value" in the following commented code to avoid |
676 | // MSVC warning C4146 |
677 | // return (A | B) & -(A | B); |
678 | return (A | B) & (1 + ~(A | B)); |
679 | } |
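// Illustrative compile-time checks (editorial sketch): the result is the
// largest power of 2 dividing both arguments.
static_assert(MinAlign(8, 4) == 4, "gcd of two powers of two");
static_assert(MinAlign(24, 16) == 8, "lowest set bit of (24 | 16)");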
680 | |
681 | /// Returns the next power of two (in 64-bits) that is strictly greater than A. |
682 | /// Returns zero on overflow. |
683 | inline uint64_t NextPowerOf2(uint64_t A) { |
684 | A |= (A >> 1); |
685 | A |= (A >> 2); |
686 | A |= (A >> 4); |
687 | A |= (A >> 8); |
688 | A |= (A >> 16); |
689 | A |= (A >> 32); |
690 | return A + 1; |
691 | } |
692 | |
693 | /// Returns the power of two which is less than or equal to the given value. |
694 | /// Essentially, it is a floor operation across the domain of powers of two. |
695 | inline uint64_t PowerOf2Floor(uint64_t A) { |
696 | if (!A) return 0; |
697 | return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); |
698 | } |
699 | |
700 | /// Returns the power of two which is greater than or equal to the given value. |
701 | /// Essentially, it is a ceil operation across the domain of powers of two. |
702 | inline uint64_t PowerOf2Ceil(uint64_t A) { |
703 | if (!A) |
704 | return 0; |
705 | return NextPowerOf2(A - 1); |
706 | } |
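// Worked examples (illustrative sketch): NextPowerOf2(16) == 32 (strictly
// greater), PowerOf2Floor(20) == 16, PowerOf2Ceil(20) == 32, and
// PowerOf2Ceil(16) == 16.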
707 | |
708 | /// Returns the next integer (mod 2**64) that is greater than or equal to |
709 | /// \p Value and is a multiple of \p Align. \p Align must be non-zero. |
710 | /// |
711 | /// If non-zero \p Skew is specified, the return value will be a minimal |
712 | /// integer that is greater than or equal to \p Value and equal to |
713 | /// \p Align * N + \p Skew for some integer N. If \p Skew is larger than |
714 | /// \p Align, its value is adjusted to '\p Skew mod \p Align'. |
715 | /// |
716 | /// Examples: |
717 | /// \code |
718 | /// alignTo(5, 8) = 8 |
719 | /// alignTo(17, 8) = 24 |
720 | /// alignTo(~0LL, 8) = 0 |
721 | /// alignTo(321, 255) = 510 |
722 | /// |
723 | /// alignTo(5, 8, 7) = 7 |
724 | /// alignTo(17, 8, 1) = 17 |
725 | /// alignTo(~0LL, 8, 3) = 3 |
726 | /// alignTo(321, 255, 42) = 552 |
727 | /// \endcode |
728 | inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { |
729 | assert(Align != 0u && "Align can't be 0.");
730 | Skew %= Align; |
731 | return (Value + Align - 1 - Skew) / Align * Align + Skew; |
732 | } |
733 | |
734 | /// Returns the next integer (mod 2**64) that is greater than or equal to |
735 | /// \p Value and is a multiple of \c Align. \c Align must be non-zero. |
736 | template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) { |
737 | static_assert(Align != 0u, "Align must be non-zero"); |
738 | return (Value + Align - 1) / Align * Align; |
739 | } |
740 | |
741 | /// Returns the integer ceil(Numerator / Denominator). |
742 | inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { |
743 | return alignTo(Numerator, Denominator) / Denominator; |
744 | } |
745 | |
746 | /// Returns the integer nearest(Numerator / Denominator). |
747 | inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { |
748 | return (Numerator + (Denominator / 2)) / Denominator; |
749 | } |
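// Worked examples (illustrative sketch): divideCeil(7, 2) == 4,
// divideNearest(7, 2) == 4, and divideNearest(5, 4) == 1.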
750 | |
751 | /// Returns the largest uint64_t that is less than or equal to \p Value and
752 | /// is \p Skew mod \p Align. \p Align must be non-zero.
753 | inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { |
754 | assert(Align != 0u && "Align can't be 0.");
755 | Skew %= Align; |
756 | return (Value - Skew) / Align * Align + Skew; |
757 | } |
758 | |
759 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. |
760 | /// Requires 0 < B <= 32. |
761 | template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) { |
762 | static_assert(B > 0, "Bit width can't be 0."); |
763 | static_assert(B <= 32, "Bit width out of range."); |
764 | return int32_t(X << (32 - B)) >> (32 - B); |
765 | } |
766 | |
767 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. |
768 | /// Requires 0 < B <= 32. |
769 | inline int32_t SignExtend32(uint32_t X, unsigned B) { |
770 | assert(B > 0 && "Bit width can't be 0.");
771 | assert(B <= 32 && "Bit width out of range.");
772 | return int32_t(X << (32 - B)) >> (32 - B); |
773 | } |
774 | |
775 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. |
776 | /// Requires 0 < B <= 64. |
777 | template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) { |
778 | static_assert(B > 0, "Bit width can't be 0."); |
779 | static_assert(B <= 64, "Bit width out of range."); |
780 | return int64_t(x << (64 - B)) >> (64 - B); |
781 | } |
782 | |
783 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. |
784 | /// Requires 0 < B <= 64. |
785 | inline int64_t SignExtend64(uint64_t X, unsigned B) { |
786 | assert(B > 0 && "Bit width can't be 0.");
787 | assert(B <= 64 && "Bit width out of range.");
788 | return int64_t(X << (64 - B)) >> (64 - B); |
789 | } |
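// Worked examples (illustrative sketch): SignExtend32<4>(0xF) == -1 and
// SignExtend32<4>(0x7) == 7; shifting left then arithmetic-shifting right
// replicates bit B-1 into the upper bits.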
790 | |
791 | /// Subtract two unsigned integers, X and Y, of type T and return the absolute |
792 | /// value of the result. |
793 | template <typename T> |
794 | std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) { |
795 | return X > Y ? (X - Y) : (Y - X); |
796 | } |
797 | |
798 | /// Add two unsigned integers, X and Y, of type T. Clamp the result to the |
799 | /// maximum representable value of T on overflow. ResultOverflowed indicates if |
800 | /// the result is larger than the maximum representable value of type T. |
801 | template <typename T> |
802 | std::enable_if_t<std::is_unsigned<T>::value, T> |
803 | SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { |
804 | bool Dummy; |
805 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
806 | // Hacker's Delight, p. 29 |
807 | T Z = X + Y; |
808 | Overflowed = (Z < X || Z < Y); |
809 | if (Overflowed) |
810 | return std::numeric_limits<T>::max(); |
811 | else |
812 | return Z; |
813 | } |
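// Worked example (illustrative sketch): SaturatingAdd<uint8_t>(200, 100)
// returns 255 and, when a flag pointer is passed, sets *ResultOverflowed.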
814 | |
815 | /// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the |
816 | /// maximum representable value of T on overflow. ResultOverflowed indicates if |
817 | /// the result is larger than the maximum representable value of type T. |
818 | template <typename T> |
819 | std::enable_if_t<std::is_unsigned<T>::value, T> |
820 | SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { |
821 | bool Dummy; |
822 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
823 | |
824 | // Hacker's Delight, p. 30 has a different algorithm, but we don't use that |
825 | // because it fails for uint16_t (where multiplication can have undefined |
826 | // behavior due to promotion to int), and requires a division in addition |
827 | // to the multiplication. |
828 | |
829 | Overflowed = false; |
830 | |
831 | // Log2(Z) would be either Log2Z or Log2Z + 1. |
832 | // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z |
833 | // will necessarily be less than Log2Max as desired. |
834 | int Log2Z = Log2_64(X) + Log2_64(Y); |
835 | const T Max = std::numeric_limits<T>::max(); |
836 | int Log2Max = Log2_64(Max); |
837 | if (Log2Z < Log2Max) { |
838 | return X * Y; |
839 | } |
840 | if (Log2Z > Log2Max) { |
841 | Overflowed = true; |
842 | return Max; |
843 | } |
844 | |
845 | // We're going to use the top bit, and maybe overflow one |
846 | // bit past it. Multiply all but the bottom bit then add |
847 | // that on at the end. |
848 | T Z = (X >> 1) * Y; |
849 | if (Z & ~(Max >> 1)) { |
850 | Overflowed = true; |
851 | return Max; |
852 | } |
853 | Z <<= 1; |
854 | if (X & 1) |
855 | return SaturatingAdd(Z, Y, ResultOverflowed); |
856 | |
857 | return Z; |
858 | } |
859 | |
860 | /// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to |
861 | /// the product. Clamp the result to the maximum representable value of T on |
862 | /// overflow. ResultOverflowed indicates if the result is larger than the |
863 | /// maximum representable value of type T. |
864 | template <typename T> |
865 | std::enable_if_t<std::is_unsigned<T>::value, T> |
866 | SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) { |
867 | bool Dummy; |
868 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
869 | |
870 | T Product = SaturatingMultiply(X, Y, &Overflowed); |
871 | if (Overflowed) |
872 | return Product; |
873 | |
874 | return SaturatingAdd(A, Product, &Overflowed); |
875 | } |
876 | |
877 | /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. |
878 | extern const float huge_valf; |
879 | |
880 | |
881 | /// Add two signed integers, computing the two's complement truncated result, |
882 | /// returning true if overflow occurred.
883 | template <typename T> |
884 | std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) { |
885 | #if __has_builtin(__builtin_add_overflow)
886 | return __builtin_add_overflow(X, Y, &Result); |
887 | #else |
888 | // Perform the unsigned addition. |
889 | using U = std::make_unsigned_t<T>; |
890 | const U UX = static_cast<U>(X); |
891 | const U UY = static_cast<U>(Y); |
892 | const U UResult = UX + UY; |
893 | |
894 | // Convert to signed. |
895 | Result = static_cast<T>(UResult); |
896 | |
897 | // Adding two positive numbers should result in a positive number. |
898 | if (X > 0 && Y > 0) |
899 | return Result <= 0; |
900 | // Adding two negatives should result in a negative number. |
901 | if (X < 0 && Y < 0) |
902 | return Result >= 0; |
903 | return false; |
904 | #endif |
905 | } |
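// Worked example (illustrative sketch, two's-complement truncation):
//   int8_t R;
//   AddOverflow<int8_t>(100, 100, R); // returns true, leaves R == -56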
906 | |
907 | /// Subtract two signed integers, computing the two's complement truncated |
908 | /// result, returning true if an overflow occurred.
909 | template <typename T> |
910 | std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) { |
911 | #if __has_builtin(__builtin_sub_overflow)
912 | return __builtin_sub_overflow(X, Y, &Result); |
913 | #else |
914 | // Perform the unsigned subtraction.
915 | using U = std::make_unsigned_t<T>; |
916 | const U UX = static_cast<U>(X); |
917 | const U UY = static_cast<U>(Y); |
918 | const U UResult = UX - UY; |
919 | |
920 | // Convert to signed. |
921 | Result = static_cast<T>(UResult); |
922 | |
923 | // Subtracting a positive number from a negative results in a negative number. |
924 | if (X <= 0 && Y > 0) |
925 | return Result >= 0; |
926 | // Subtracting a negative number from a positive results in a positive number. |
927 | if (X >= 0 && Y < 0) |
928 | return Result <= 0; |
929 | return false; |
930 | #endif |
931 | } |
932 | |
933 | /// Multiply two signed integers, computing the two's complement truncated |
934 | /// result, returning true if an overflow occurred.
935 | template <typename T> |
936 | std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) { |
937 | // Perform the unsigned multiplication on absolute values. |
938 | using U = std::make_unsigned_t<T>; |
939 | const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X); |
940 | const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y); |
941 | const U UResult = UX * UY; |
942 | |
943 | // Convert to signed. |
944 | const bool IsNegative = (X < 0) ^ (Y < 0); |
945 | Result = IsNegative ? (0 - UResult) : UResult; |
946 | |
947 | // If any of the args was 0, result is 0 and no overflow occurs. |
948 | if (UX == 0 || UY == 0) |
949 | return false; |
950 | |
951 | // UX and UY are in [1, 2^n], where n is the number of digits. |
952 | // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for |
953 | // positive) divided by an argument compares to the other. |
954 | if (IsNegative) |
955 | return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY; |
956 | else |
957 | return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY; |
958 | } |
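// Worked examples (illustrative sketch): with int8_t R,
// MulOverflow<int8_t>(16, 8, R) returns true (128 does not fit), while
// MulOverflow<int8_t>(-16, 8, R) returns false with R == -128.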
959 | |
960 | } // End llvm namespace |
961 | |
962 | #endif |