AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
20#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/Constants.h"
63#include "llvm/IR/DataLayout.h"
64#include "llvm/IR/DebugLoc.h"
66#include "llvm/IR/Function.h"
68#include "llvm/IR/GlobalValue.h"
69#include "llvm/IR/IRBuilder.h"
70#include "llvm/IR/Instruction.h"
73#include "llvm/IR/Intrinsics.h"
74#include "llvm/IR/IntrinsicsAArch64.h"
75#include "llvm/IR/Module.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/Use.h"
79#include "llvm/IR/Value.h"
84#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <bitset>
96#include <cassert>
97#include <cctype>
98#include <cstdint>
99#include <cstdlib>
100#include <iterator>
101#include <limits>
102#include <optional>
103#include <tuple>
104#include <utility>
105#include <vector>
106
107using namespace llvm;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
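// Note: like the other cl::opt flags in this file, these are internal
// developer options; when driving the compiler through clang they are
// typically passed with -mllvm, e.g. "-mllvm -aarch64-enable-logical-imm=false".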
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130 // in the future, once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142 // All of the XOR, OR and CMP operations use ALU ports, and the data dependency
143 // becomes the bottleneck after this transform on high-end CPUs. This maximum
144 // leaf-node limit is a guard so that the cmp+ccmp transform remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148 // By turning this on, we will not fall back to DAG ISel when encountering
149 // scalable vector types for any instruction, even if SVE is not yet supported
150 // for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
178 ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }
179
180 ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
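// For example, getPackedSVEVectorVT(MVT::f16) yields MVT::nxv8f16: with SVE's
// 128-bit register granule, eight f16 lanes fill the register, which is what
// "packed" means here.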
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
207 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
222 static inline EVT getPromotedVTForPredicate(EVT VT) {
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
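// For example, an nxv4i1 predicate is promoted to nxv4i32: the element count
// is preserved while each i1 lane is widened to the integer element size it
// governs.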
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
250
251// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
252// predicate and end with a passthru value matching the result type.
253static bool isMergePassthruOpcode(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
258 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
259 case AArch64ISD::REVH_MERGE_PASSTHRU:
260 case AArch64ISD::REVW_MERGE_PASSTHRU:
261 case AArch64ISD::REVD_MERGE_PASSTHRU:
262 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
263 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
264 case AArch64ISD::DUP_MERGE_PASSTHRU:
265 case AArch64ISD::ABS_MERGE_PASSTHRU:
266 case AArch64ISD::NEG_MERGE_PASSTHRU:
267 case AArch64ISD::FNEG_MERGE_PASSTHRU:
268 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
269 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
270 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
271 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
272 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
273 case AArch64ISD::FRINT_MERGE_PASSTHRU:
274 case AArch64ISD::FROUND_MERGE_PASSTHRU:
275 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
276 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
277 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
278 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
279 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
280 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
281 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
282 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
283 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
284 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
285 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
286 case AArch64ISD::FABS_MERGE_PASSTHRU:
287 return true;
288 }
289}
290
291// Returns true if inactive lanes are known to be zeroed by construction.
292 static bool isZeroingInactiveLanes(SDValue Op) {
293 switch (Op.getOpcode()) {
294 default:
295 return false;
296 // We guarantee i1 splat_vectors to zero the other lanes
297 case ISD::SPLAT_VECTOR:
298 case ISD::GET_ACTIVE_LANE_MASK:
299 case AArch64ISD::PTRUE:
300 case AArch64ISD::SETCC_MERGE_ZERO:
301 return true;
302 case ISD::INTRINSIC_WO_CHAIN:
303 switch (Op.getConstantOperandVal(0)) {
304 default:
305 return false;
306 case Intrinsic::aarch64_sve_ptrue:
307 case Intrinsic::aarch64_sve_pnext:
308 case Intrinsic::aarch64_sve_cmpeq:
309 case Intrinsic::aarch64_sve_cmpne:
310 case Intrinsic::aarch64_sve_cmpge:
311 case Intrinsic::aarch64_sve_cmpgt:
312 case Intrinsic::aarch64_sve_cmphs:
313 case Intrinsic::aarch64_sve_cmphi:
314 case Intrinsic::aarch64_sve_cmpeq_wide:
315 case Intrinsic::aarch64_sve_cmpne_wide:
316 case Intrinsic::aarch64_sve_cmpge_wide:
317 case Intrinsic::aarch64_sve_cmpgt_wide:
318 case Intrinsic::aarch64_sve_cmplt_wide:
319 case Intrinsic::aarch64_sve_cmple_wide:
320 case Intrinsic::aarch64_sve_cmphs_wide:
321 case Intrinsic::aarch64_sve_cmphi_wide:
322 case Intrinsic::aarch64_sve_cmplo_wide:
323 case Intrinsic::aarch64_sve_cmpls_wide:
324 case Intrinsic::aarch64_sve_fcmpeq:
325 case Intrinsic::aarch64_sve_fcmpne:
326 case Intrinsic::aarch64_sve_fcmpge:
327 case Intrinsic::aarch64_sve_fcmpgt:
328 case Intrinsic::aarch64_sve_fcmpuo:
329 case Intrinsic::aarch64_sve_facgt:
330 case Intrinsic::aarch64_sve_facge:
331 case Intrinsic::aarch64_sve_whilege:
332 case Intrinsic::aarch64_sve_whilegt:
333 case Intrinsic::aarch64_sve_whilehi:
334 case Intrinsic::aarch64_sve_whilehs:
335 case Intrinsic::aarch64_sve_whilele:
336 case Intrinsic::aarch64_sve_whilelo:
337 case Intrinsic::aarch64_sve_whilels:
338 case Intrinsic::aarch64_sve_whilelt:
339 case Intrinsic::aarch64_sve_match:
340 case Intrinsic::aarch64_sve_nmatch:
341 case Intrinsic::aarch64_sve_whilege_x2:
342 case Intrinsic::aarch64_sve_whilegt_x2:
343 case Intrinsic::aarch64_sve_whilehi_x2:
344 case Intrinsic::aarch64_sve_whilehs_x2:
345 case Intrinsic::aarch64_sve_whilele_x2:
346 case Intrinsic::aarch64_sve_whilelo_x2:
347 case Intrinsic::aarch64_sve_whilels_x2:
348 case Intrinsic::aarch64_sve_whilelt_x2:
349 return true;
350 }
351 }
352}
353
354static std::tuple<SDValue, SDValue>
355 extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
356 SDLoc DL(Disc);
357 SDValue AddrDisc;
358 SDValue ConstDisc;
359
360 // If this is a blend, remember the constant and address discriminators.
361 // Otherwise, it's either a constant discriminator, or a non-blended
362 // address discriminator.
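 // For example, a discriminator computed as @llvm.ptrauth.blend(%addrdisc, 1234)
 // is split into the pair (constant 1234, address %addrdisc); a lone 16-bit
 // constant C becomes (C, XZR); anything else is returned unchanged as (0, Disc).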
363 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
364 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
365 AddrDisc = Disc->getOperand(1);
366 ConstDisc = Disc->getOperand(2);
367 } else {
368 ConstDisc = Disc;
369 }
370
371 // If the constant discriminator (either the blend RHS, or the entire
372 // discriminator value) isn't a 16-bit constant, bail out, and let the
373 // discriminator be computed separately.
374 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
375 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
376 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
377
378 // If there's no address discriminator, use NoRegister, which we'll later
379 // replace with XZR, or directly use a Z variant of the inst. when available.
380 if (!AddrDisc)
381 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
382
383 return std::make_tuple(
384 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
385 AddrDisc);
386}
387
388 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
389 const AArch64Subtarget &STI)
390 : TargetLowering(TM), Subtarget(&STI) {
391 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
392 // we have to make something up. Arbitrarily, choose ZeroOrOne.
393 setBooleanContents(ZeroOrOneBooleanContent);
394 // When comparing vectors the result sets the different elements in the
395 // vector to all-one or all-zero.
396 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
397
398 // Set up the register classes.
399 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
400 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
401
402 if (Subtarget->hasLS64()) {
403 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
404 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
405 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
406 }
407
408 if (Subtarget->hasFPARMv8()) {
409 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
410 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
411 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
412 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
413 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
414 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
415 }
416
417 if (Subtarget->hasNEON()) {
418 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
419 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
420
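 // addDRType/addQRType (defined later in this file) register a type in the
 // 64-bit D-register class (FPR64) or the 128-bit Q-register class (FPR128)
 // respectively, and set up the common NEON operation actions for it.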
421 addDRType(MVT::v2f32);
422 addDRType(MVT::v8i8);
423 addDRType(MVT::v4i16);
424 addDRType(MVT::v2i32);
425 addDRType(MVT::v1i64);
426 addDRType(MVT::v1f64);
427 addDRType(MVT::v4f16);
428 addDRType(MVT::v4bf16);
429
430 addQRType(MVT::v4f32);
431 addQRType(MVT::v2f64);
432 addQRType(MVT::v16i8);
433 addQRType(MVT::v8i16);
434 addQRType(MVT::v4i32);
435 addQRType(MVT::v2i64);
436 addQRType(MVT::v8f16);
437 addQRType(MVT::v8bf16);
438 }
439
440 if (Subtarget->isSVEorStreamingSVEAvailable()) {
441 // Add legal sve predicate types
442 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
443 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
444 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
445 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
446 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
447
448 // Add sve predicate as counter type
449 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
450
451 // Add legal sve data types
452 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
453 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
454 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
455 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
456
457 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
458 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
459 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
460 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
461 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
462 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
463
464 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
465 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
466 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
467
468 if (Subtarget->useSVEForFixedLengthVectors()) {
471 addRegisterClass(VT, &AArch64::ZPRRegClass);
472
475 addRegisterClass(VT, &AArch64::ZPRRegClass);
476 }
477 }
478
479 // Compute derived properties from the register classes
480 computeRegisterProperties(Subtarget->getRegisterInfo());
481
482 // Provide all sorts of operation actions
500 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
501 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
502 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
503 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
504 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
505 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
508 if (Subtarget->hasFPARMv8()) {
511 }
520 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
522 setOperationAction(ISD::BRIND, MVT::Other, Custom);
524
526
530
534
536
537 // Custom lowering hooks are needed for XOR
538 // to fold it into CSINC/CSINV.
541
542 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
543 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
544
545 // Virtually no operation on f128 is legal, but LLVM can't expand them when
546 // there's a valid register class, so we need custom operations in most cases.
547 setOperationAction(ISD::FABS, MVT::f128, Expand);
550 setOperationAction(ISD::FCOS, MVT::f128, Expand);
554 setOperationAction(ISD::FNEG, MVT::f128, Expand);
555 setOperationAction(ISD::FPOW, MVT::f128, Expand);
557 setOperationAction(ISD::FRINT, MVT::f128, Expand);
558 setOperationAction(ISD::FSIN, MVT::f128, Expand);
559 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
560 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
562 setOperationAction(ISD::FTAN, MVT::f128, Expand);
563 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
567 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
570 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
571 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
572 // aren't handled.
573
574 // Lowering for many of the conversions is actually specified by the non-f128
575 // type. The LowerXXX function will be trivial when f128 isn't involved.
600 if (Subtarget->hasFPARMv8()) {
603 }
606 if (Subtarget->hasFPARMv8()) {
609 }
612
617
618 // Variable arguments.
619 setOperationAction(ISD::VASTART, MVT::Other, Custom);
620 setOperationAction(ISD::VAARG, MVT::Other, Custom);
621 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
622 setOperationAction(ISD::VAEND, MVT::Other, Expand);
623
624 // Variable-sized objects.
625 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
626 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
627
628 // Lowering Funnel Shifts to EXTR
633
634 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
635
636 // Constant pool entries
638
639 // BlockAddress
641
642 // AArch64 lacks both left-rotate and popcount instructions.
648 }
649
650 // AArch64 doesn't have i32 MULH{S|U}.
653
654 // AArch64 doesn't have {U|S}MUL_LOHI.
659
660 if (Subtarget->hasCSSC()) {
664
666
670
673
678
683 } else {
687
690
693 }
694
700 }
707
708 // Custom lower Add/Sub/Mul with overflow.
721
730
731 setOperationAction(ISD::FSIN, MVT::f32, Expand);
732 setOperationAction(ISD::FSIN, MVT::f64, Expand);
733 setOperationAction(ISD::FCOS, MVT::f32, Expand);
734 setOperationAction(ISD::FCOS, MVT::f64, Expand);
735 setOperationAction(ISD::FPOW, MVT::f32, Expand);
736 setOperationAction(ISD::FPOW, MVT::f64, Expand);
739 if (Subtarget->hasFullFP16()) {
742 } else {
745 }
746
747 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
748 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
749 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
750 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
751 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
752 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
753 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
761 setOperationAction(Op, MVT::f16, Promote);
762 setOperationAction(Op, MVT::v4f16, Expand);
763 setOperationAction(Op, MVT::v8f16, Expand);
764 setOperationAction(Op, MVT::bf16, Promote);
765 setOperationAction(Op, MVT::v4bf16, Expand);
766 setOperationAction(Op, MVT::v8bf16, Expand);
767 }
768
769 // Legalize fcanonicalize to circumvent default expansion
770 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
771 if (Subtarget->hasFullFP16()) {
773 }
774
775 // fpextend from f16 or bf16 to f32 is legal
776 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
777 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
780 // fpextend from bf16 to f64 needs to be split into two fpextends
781 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
783
784 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
785 for (auto Op : {
788 ISD::BR_CC,
789 ISD::FADD,
790 ISD::FSUB,
791 ISD::FMUL,
792 ISD::FDIV,
793 ISD::FMA,
794 ISD::FCEIL,
795 ISD::FSQRT,
796 ISD::FFLOOR,
797 ISD::FNEARBYINT,
798 ISD::FRINT,
799 ISD::FROUND,
800 ISD::FROUNDEVEN,
801 ISD::FTRUNC,
802 ISD::FMINNUM,
803 ISD::FMAXNUM,
804 ISD::FMINIMUM,
805 ISD::FMAXIMUM,
806 ISD::FMINIMUMNUM,
807 ISD::FMAXIMUMNUM,
826 })
827 setOperationAction(Op, ScalarVT, Promote);
828
829 for (auto Op : {ISD::FNEG, ISD::FABS})
830 setOperationAction(Op, ScalarVT, Legal);
831
832 // Round-to-integer needs custom lowering for fp16, as Promote doesn't work
833 // because the result type is integer.
834 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
837 setOperationAction(Op, ScalarVT, Custom);
838
839 // promote v4f16 to v4f32 when that is known to be safe.
840 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
841 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
842 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
843 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
844 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
845 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
846 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
847 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
848 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
849 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
850 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
851 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
852 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
853 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
854
855 setOperationAction(ISD::FABS, V4Narrow, Legal);
856 setOperationAction(ISD::FNEG, V4Narrow, Legal);
858 setOperationAction(ISD::BR_CC, V4Narrow, Expand);
862 setOperationAction(ISD::FSQRT, V4Narrow, Expand);
863
864 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
865 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
866 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
867
868 setOperationAction(ISD::FABS, V8Narrow, Legal);
870 setOperationAction(ISD::FCEIL, V8Narrow, Legal);
873 setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
876 setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
877 setOperationAction(ISD::FNEG, V8Narrow, Legal);
878 setOperationAction(ISD::FROUND, V8Narrow, Legal);
879 setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
880 setOperationAction(ISD::FRINT, V8Narrow, Legal);
881 setOperationAction(ISD::FSQRT, V8Narrow, Expand);
883 setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
884 setOperationAction(ISD::BR_CC, V8Narrow, Expand);
887 setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
888 };
889
890 if (!Subtarget->hasFullFP16()) {
891 LegalizeNarrowFP(MVT::f16);
892 }
893 LegalizeNarrowFP(MVT::bf16);
896
897 // AArch64 has implementations of a lot of rounding-like FP operations.
898 // clang-format off
899 for (auto Op :
900 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
901 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
902 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
903 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
904 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
905 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
911 for (MVT Ty : {MVT::f32, MVT::f64})
913 if (Subtarget->hasFullFP16())
914 setOperationAction(Op, MVT::f16, Legal);
915 }
916 // clang-format on
917
918 // Basic strict FP operations are legal
921 for (MVT Ty : {MVT::f32, MVT::f64})
923 if (Subtarget->hasFullFP16())
924 setOperationAction(Op, MVT::f16, Legal);
925 }
926
927 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
928
930 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
931 setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
932 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
933 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
934
935 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
936 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
937 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
938 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
939 } else {
940 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
941 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
942 }
943 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
944 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
945
946 // Generate outline atomics library calls only if LSE was not specified for
947 // subtarget
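 // (The LibCall lowering targets the __aarch64_* outline-atomics helpers
 // shipped with compiler-rt/libgcc, e.g. __aarch64_ldadd4_relax or
 // __aarch64_cas8_acq_rel.)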
948 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
949 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
950 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
951 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
952 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
953 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
954 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
955 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
956 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
957 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
958 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
959 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
960 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
961 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
962 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
963 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
964 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
965 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
966 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
967 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
968 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
969 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
970 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
971 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
972 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
973 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
974 }
975
976 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
977 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f16, LibCall);
978 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f32, LibCall);
979 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f64, LibCall);
980 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::bf16, LibCall);
981
982 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f16, LibCall);
983 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f32, LibCall);
984 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f64, LibCall);
985 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::bf16, LibCall);
986
987 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f16, LibCall);
988 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f32, LibCall);
989 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f64, LibCall);
990 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::bf16, LibCall);
991
992 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f16, LibCall);
993 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f32, LibCall);
994 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f64, LibCall);
995 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::bf16, LibCall);
996
997 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f16, LibCall);
998 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f32, LibCall);
999 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f64, LibCall);
1000 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::bf16, LibCall);
1001 }
1002
1003 if (Subtarget->hasLSE128()) {
1004 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1005 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1006 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
1007 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
1008 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
1009 }
1010
1011 // 128-bit loads and stores can be done without expanding
1012 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1013 setOperationAction(ISD::STORE, MVT::i128, Custom);
1014
1015 // Aligned 128-bit loads and stores are single-copy atomic according to the
1016 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1017 if (Subtarget->hasLSE2()) {
1018 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
1019 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
1020 }
1021
1022 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1023 // custom lowering, as there are no un-paired non-temporal stores and
1024 // legalization will break up 256 bit inputs.
1025 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1026 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1027 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1028 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1029 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1030 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1031 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1032 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1033
1034 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1035 // custom lowering, as there are no un-paired non-temporal loads and legalization
1036 // will break up 256 bit inputs.
1037 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1038 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1039 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1040 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1041 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1042 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1043 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1044 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1045
1046 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1047 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
1048
1049 // Issue __sincos_stret if available.
1050 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1051 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1052
1053 // Make floating-point constants legal for the large code model, so they don't
1054 // become loads from the constant pool.
1055 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1058 }
1059
1060 // AArch64 does not have floating-point extending loads, i1 sign-extending
1061 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1062 for (MVT VT : MVT::fp_valuetypes()) {
1063 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1064 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1065 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1066 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1067 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1068 }
1069 for (MVT VT : MVT::integer_valuetypes())
1070 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1071
1072 for (MVT WideVT : MVT::fp_valuetypes()) {
1073 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1074 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1075 setTruncStoreAction(WideVT, NarrowVT, Expand);
1076 }
1077 }
1078 }
1079
1080 if (Subtarget->hasFPARMv8()) {
1081 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1082 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1083 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1084 }
1085
1086 // Indexed loads and stores are supported.
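 // That is, the pre- and post-indexed addressing forms, e.g.
 // "ldr x0, [x1, #8]!" (pre-indexed) and "ldr x0, [x1], #8" (post-indexed).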
1087 for (unsigned im = (unsigned)ISD::PRE_INC;
1088 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1089 setIndexedLoadAction(im, MVT::i8, Legal);
1090 setIndexedLoadAction(im, MVT::i16, Legal);
1091 setIndexedLoadAction(im, MVT::i32, Legal);
1092 setIndexedLoadAction(im, MVT::i64, Legal);
1093 setIndexedLoadAction(im, MVT::f64, Legal);
1094 setIndexedLoadAction(im, MVT::f32, Legal);
1095 setIndexedLoadAction(im, MVT::f16, Legal);
1096 setIndexedLoadAction(im, MVT::bf16, Legal);
1097 setIndexedStoreAction(im, MVT::i8, Legal);
1098 setIndexedStoreAction(im, MVT::i16, Legal);
1099 setIndexedStoreAction(im, MVT::i32, Legal);
1100 setIndexedStoreAction(im, MVT::i64, Legal);
1101 setIndexedStoreAction(im, MVT::f64, Legal);
1102 setIndexedStoreAction(im, MVT::f32, Legal);
1103 setIndexedStoreAction(im, MVT::f16, Legal);
1104 setIndexedStoreAction(im, MVT::bf16, Legal);
1105 }
1106
1107 // Trap.
1108 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1109 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1110 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1111
1112 // We combine OR nodes for ccmp operations.
1114 // Try to create BICs for vector ANDs.
1116
1117 // llvm.init.trampoline and llvm.adjust.trampoline
1118 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
1119 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
1120
1121 // Vector add and sub nodes may conceal a high-half opportunity.
1122 // Also, try to fold ADD into CSINC/CSINV..
1125
1128
1129 // Try and combine setcc with csel
1131
1133
1137 ISD::STORE, ISD::BUILD_VECTOR});
1140 setTargetDAGCombine(ISD::LOAD);
1141
1142 setTargetDAGCombine(ISD::MSTORE);
1143
1145
1147
1150 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1151
1153 {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1154
1155 setTargetDAGCombine(ISD::FP_EXTEND);
1156
1158
1160
1161 setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK);
1162
1163 setTargetDAGCombine(ISD::VECREDUCE_AND);
1164 setTargetDAGCombine(ISD::VECREDUCE_OR);
1165 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1166
1168
1172
1173 // In case of strict alignment, avoid an excessive number of byte wide stores.
1176 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1177
1181 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1182
1185 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1186
1189 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1190
1192
1194
1195 EnableExtLdPromotion = true;
1196
1197 // Set required alignment.
1199 // Set preferred alignments.
1200
1201 // Don't align loops on Windows. The SEH unwind info generation needs to
1202 // know the exact length of functions before the alignments have been
1203 // expanded.
1204 if (!Subtarget->isTargetWindows())
1208
1209 // Only change the limit for entries in a jump table if specified by
1210 // the subtarget, but not at the command line.
1211 unsigned MaxJT = STI.getMaximumJumpTableSize();
1212 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1214
1216
1218
1220 if (Subtarget->hasSME())
1222
1223 if (Subtarget->isNeonAvailable()) {
1224 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1225 // silliness like this:
1226 // clang-format off
1227 for (auto Op :
1228 {ISD::SELECT, ISD::SELECT_CC, ISD::FATAN2,
1229 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1231 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1232 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1233 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1234 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1235 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1236 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1237 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1238 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1239 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1240 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1241 ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1248 setOperationAction(Op, MVT::v1f64, Expand);
1249 // clang-format on
1250
1251 for (auto Op :
1256 setOperationAction(Op, MVT::v1i64, Expand);
1257
1258 // AArch64 doesn't have direct vector->f32 conversion instructions for
1259 // elements smaller than i32, so promote the input to i32 first.
1260 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1261 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1262
1263 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1264 // Nor is there a direct i32 -> f16 vector conversion. Set it to Custom, so the
1265 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1268 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1270
1271 if (Subtarget->hasFullFP16()) {
1274
1283 } else {
1284 // when AArch64 doesn't have fullfp16 support, promote the input
1285 // to i32 first.
1286 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1287 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1288 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1289 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1290 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1291 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1292 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1293 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1294 }
1295
1296 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1297 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1304 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1309 }
1310
1311 // Custom handling for some quad-vector types to detect MULL.
1312 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1313 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1314 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1315 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1316 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1317 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1318
1319 // Saturates
1320 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1321 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1326 }
1327
1328 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1329 MVT::v4i32}) {
1336 }
1337
1338 // Vector reductions
1339 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1340 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1341 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1342 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1343 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1344 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1345 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1346
1347 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1348 }
1349 }
1350 if (Subtarget->hasFullFP16())
1351 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
1352
1353 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1354 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1355 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1356 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1357 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1358 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1359 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1360 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1361 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1362 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1363 }
1364 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1365 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1366 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1367 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1368
1370 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1371 // Likewise, narrowing and extending vector loads/stores aren't handled
1372 // directly.
1375
1376 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1379 } else {
1382 }
1385
1388
1389 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1390 setTruncStoreAction(VT, InnerVT, Expand);
1391 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1392 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1393 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1394 }
1395 }
1396
1397 for (auto Op :
1398 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1399 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1403 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1405 if (Subtarget->hasFullFP16())
1406 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1408 }
1409
1410 // LRINT and LLRINT.
1411 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1412 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1414 if (Subtarget->hasFullFP16())
1415 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1417 }
1418
1419 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1420
1421 setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1422 setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1423 setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1424 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1425
1426 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1427 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1428 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1429
1430 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1431 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1432 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1433 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1434 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1435 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1436
1437 // ADDP custom lowering
1438 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1440 // FADDP custom lowering
1441 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1443
1444 if (Subtarget->hasDotProd()) {
1445 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1446 ISD::PARTIAL_REDUCE_UMLA};
1447
1448 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1449 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1450 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
1451 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1452
1453 if (Subtarget->hasMatMulInt8()) {
1454 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v4i32,
1455 MVT::v16i8, Legal);
1456 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i64,
1457 MVT::v16i8, Custom);
1458
1459 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32,
1460 MVT::v8i8, Legal);
1461 }
1462 }
1463
1464 } else /* !isNeonAvailable */ {
1466 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1468
1469 if (VT.is128BitVector() || VT.is64BitVector()) {
1470 setOperationAction(ISD::LOAD, VT, Legal);
1471 setOperationAction(ISD::STORE, VT, Legal);
1472 setOperationAction(ISD::BITCAST, VT,
1473 Subtarget->isLittleEndian() ? Legal : Expand);
1474 }
1475 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1476 setTruncStoreAction(VT, InnerVT, Expand);
1477 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1478 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1479 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1480 }
1481 }
1482 }
1483
1484 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1488 }
1489
1490 if (Subtarget->hasSME()) {
1492 }
1493
1494 // FIXME: Move lowering for more nodes here if those are common between
1495 // SVE and SME.
1496 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1497 for (auto VT :
1498 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1503 }
1504 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1505 setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Legal);
1506 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Legal);
1507 }
1508
1509 if (Subtarget->hasSVE2p1() ||
1510 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1511 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, MVT::nxv32i1, Custom);
1512
1513 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1514 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Custom);
1515 }
1516
1517 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1518 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1529 setOperationAction(ISD::MLOAD, VT, Custom);
1530 setOperationAction(ISD::MSTORE, VT, Legal);
1550 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1551 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1552 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1553 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
1554 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1555 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1556 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1557 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1558 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1561
1567
1576
1581
1582 if (!Subtarget->isLittleEndian())
1583 setOperationAction(ISD::BITCAST, VT, Custom);
1584
1585 if (Subtarget->hasSVE2() ||
1586 (Subtarget->hasSME() && Subtarget->isStreaming()))
1587 // For SLI/SRI.
1589 }
1590
1591 // Illegal unpacked integer vector types.
1592 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1595 }
1596
1597 // Type legalize unpacked bitcasts.
1598 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1599 setOperationAction(ISD::BITCAST, VT, Custom);
1600
1601 for (auto VT :
1602 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1603 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1605
1606 // Promote predicate as counter load/stores to standard predicates.
1607 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
1608 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
1609
1610 // Predicate as counter legalization actions.
1611 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
1612 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
1613
1614 for (auto VT :
1615 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1620 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1621 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1622 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1623
1627
1628 // There are no legal MVT::nxv16f## based types.
1629 if (VT != MVT::nxv16i1) {
1634 }
1635 }
1636
1637 // NEON doesn't support masked loads/stores, but SME and SVE do.
1638 for (auto VT :
1639 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1640 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1641 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1642 setOperationAction(ISD::MLOAD, VT, Custom);
1643 setOperationAction(ISD::MSTORE, VT, Custom);
1644 }
1645
1646 // Firstly, exclude all scalable vector extending loads/truncating stores,
1647 // including both integer and floating-point scalable vectors.
1649 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1650 setTruncStoreAction(VT, InnerVT, Expand);
1651 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1652 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1653 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1654 }
1655 }
1656
1657 // Then, selectively enable those which we directly support.
1658 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1659 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1660 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1661 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1662 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1663 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1664 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1665 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1666 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1667 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1668 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1669 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1670 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1671 }
1672
1673 // SVE supports truncating stores of 64 and 128-bit vectors
1674 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1675 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1676 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1677 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1678 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1679
1680 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1681 MVT::nxv4f32, MVT::nxv2f64}) {
1682 setOperationAction(ISD::BITCAST, VT, Custom);
1685 setOperationAction(ISD::MLOAD, VT, Custom);
1693 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1694 setOperationAction(ISD::FMAXNUM, VT, Custom);
1695 setOperationAction(ISD::FMINIMUM, VT, Custom);
1696 setOperationAction(ISD::FMINNUM, VT, Custom);
1698 setOperationAction(ISD::FNEG, VT, Custom);
1700 setOperationAction(ISD::FCEIL, VT, Custom);
1701 setOperationAction(ISD::FFLOOR, VT, Custom);
1702 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1703 setOperationAction(ISD::FRINT, VT, Custom);
1704 setOperationAction(ISD::LRINT, VT, Custom);
1705 setOperationAction(ISD::LLRINT, VT, Custom);
1706 setOperationAction(ISD::FROUND, VT, Custom);
1707 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1708 setOperationAction(ISD::FTRUNC, VT, Custom);
1709 setOperationAction(ISD::FSQRT, VT, Custom);
1710 setOperationAction(ISD::FABS, VT, Custom);
1711 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1713 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1714 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1715 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1716 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1717 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1718 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
1722
1725 setOperationAction(ISD::FPOW, VT, Expand);
1726 setOperationAction(ISD::FPOWI, VT, Expand);
1727 setOperationAction(ISD::FCOS, VT, Expand);
1728 setOperationAction(ISD::FSIN, VT, Expand);
1729 setOperationAction(ISD::FSINCOS, VT, Expand);
1730 setOperationAction(ISD::FTAN, VT, Expand);
1731 setOperationAction(ISD::FACOS, VT, Expand);
1732 setOperationAction(ISD::FASIN, VT, Expand);
1733 setOperationAction(ISD::FATAN, VT, Expand);
1734 setOperationAction(ISD::FATAN2, VT, Expand);
1735 setOperationAction(ISD::FCOSH, VT, Expand);
1736 setOperationAction(ISD::FSINH, VT, Expand);
1737 setOperationAction(ISD::FTANH, VT, Expand);
1738 setOperationAction(ISD::FEXP, VT, Expand);
1739 setOperationAction(ISD::FEXP2, VT, Expand);
1740 setOperationAction(ISD::FEXP10, VT, Expand);
1741 setOperationAction(ISD::FLOG, VT, Expand);
1742 setOperationAction(ISD::FLOG2, VT, Expand);
1743 setOperationAction(ISD::FLOG10, VT, Expand);
1744
1756 }
1757
1758 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1759 setOperationAction(ISD::BITCAST, VT, Custom);
1761 setOperationAction(ISD::FABS, VT, Custom);
1763 setOperationAction(ISD::FNEG, VT, Custom);
1764 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1766 setOperationAction(ISD::MLOAD, VT, Custom);
1774
1775 if (Subtarget->hasSVEB16B16() &&
1776 Subtarget->isNonStreamingSVEorSME2Available()) {
1779 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1780 setOperationAction(ISD::FMAXNUM, VT, Custom);
1781 setOperationAction(ISD::FMINIMUM, VT, Custom);
1782 setOperationAction(ISD::FMINNUM, VT, Custom);
1785 }
1786 }
1787
1788 for (auto Opcode :
1789 {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1790 ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
1791 ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
1792 ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
1793 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1794 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1795 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1796 }
1797
1798 if (!Subtarget->hasSVEB16B16() ||
1799 !Subtarget->isNonStreamingSVEorSME2Available()) {
1800 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1801 ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
1802 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1803 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1804 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1805 }
1806 }
1807
1810
1811 // NEON doesn't support integer divides, but SVE does
1812 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1813 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1816 }
1817
1818 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1819 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1820 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1821
1822 // NOTE: Currently this has to happen after computeRegisterProperties rather
1823 // than the preferred option of combining it with the addRegisterClass call.
1824 if (Subtarget->useSVEForFixedLengthVectors()) {
1827 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1828 addTypeForFixedLengthSVE(VT);
1829 }
1832 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1833 addTypeForFixedLengthSVE(VT);
1834 }
1835
1836 // 64-bit results can come from inputs wider than NEON supports.
1837 for (auto VT : {MVT::v8i8, MVT::v4i16})
1840
1841 // 128-bit results imply an input wider than NEON supports.
1842 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1844 for (auto VT : {MVT::v8f16, MVT::v4f32})
1846
1847 // These operations are not supported on NEON but SVE can do them.
1849 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1850 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1851 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1852 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1853 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1854 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1855 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1856 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1857 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1858 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1859 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1860 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1861 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1862 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1863 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1864 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1865 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1866 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1867 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1868
1869 // Int operations with no NEON support.
1870 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1871 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1874 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1875 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1876 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1879 }
1880
1881 // Use SVE for vectors with more than 2 elements.
1882 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1883 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1884 }
1885
1886 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1887 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1888 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1889 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1890
1891 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1892
1893 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1895 }
1896
1897 // Handle partial reduction operations
1898 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1899 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1900 // Other pairs will default to 'Expand'.
1901 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1902 ISD::PARTIAL_REDUCE_UMLA};
1903 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1904 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1905
1906 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1907
1908 if (Subtarget->hasMatMulInt8()) {
1909 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv4i32,
1910 MVT::nxv16i8, Legal);
1911 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv2i64,
1912 MVT::nxv16i8, Custom);
1913 }
1914
1915 // Wide add types
1916 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1917 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1918 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1919 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1920 }
1921
1922 // Handle floating-point partial reduction
1923 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
1924 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32,
1925 MVT::nxv8f16, Legal);
1926 // We can use SVE2p1 fdot to emulate the fixed-length variant.
1927 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::v4f32,
1928 MVT::v8f16, Custom);
1929 }
1930 }
1931
1932 // Handle non-aliasing elements mask
1933 if (Subtarget->hasSVE2() ||
1934 (Subtarget->hasSME() && Subtarget->isStreaming())) {
1935 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
1936 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
1939 }
1940 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
1943 }
1944 }
1945
1946 // Handle operations that are only available in non-streaming SVE mode.
1947 if (Subtarget->isSVEAvailable()) {
1948 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1949 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1950 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1951 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1952 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1953 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1954 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1955 setOperationAction(ISD::MGATHER, VT, Custom);
1956 setOperationAction(ISD::MSCATTER, VT, Custom);
1957 }
1958
1959 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1960 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1961 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1962 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1963
1964 // We can lower types that have <vscale x {2|4}> elements to COMPACT.
1965 for (auto VT :
1966 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1967 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1969
1970 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1971 // NEON vectors in the lowest bits of the SVE register.
1972 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1973 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1975
1976 // Histcnt is SVE2 only
1977 if (Subtarget->hasSVE2()) {
1978 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
1979 Custom);
1980 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
1981 Custom);
1982
1983 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1984 ISD::PARTIAL_REDUCE_UMLA};
1985 // Must be lowered to SVE instructions.
1986 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
1987 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
1988 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1989 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
1990 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
1991 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
1992 }
1993 }
1994
1995 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1996 // Only required for llvm.aarch64.mops.memset.tag
1998 }
1999
2001
2002 if (Subtarget->hasSVE()) {
2003 setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
2004 setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
2005 setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
2006 setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
2007 }
2008
2009 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2010
2011 IsStrictFPEnabled = true;
2013
2014 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2015 // it, but it's just a wrapper around ldexp.
2016 if (Subtarget->isTargetWindows()) {
2017 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2018 if (isOperationExpand(Op, MVT::f32))
2019 setOperationAction(Op, MVT::f32, Promote);
2020 }
2021
2022 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2023 // isn't legal.
2024 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2025 if (isOperationExpand(Op, MVT::f16))
2026 setOperationAction(Op, MVT::f16, Promote);
2027}
2028
2030 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2031}
2032
2033void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2034 assert(VT.isVector() && "VT should be a vector type");
2035
2036 if (VT.isFloatingPoint()) {
2038 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2039 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2040 }
2041
2042 // Mark vector float intrinsics as expand.
2043 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2044 setOperationAction(ISD::FSIN, VT, Expand);
2045 setOperationAction(ISD::FCOS, VT, Expand);
2046 setOperationAction(ISD::FTAN, VT, Expand);
2047 setOperationAction(ISD::FASIN, VT, Expand);
2048 setOperationAction(ISD::FACOS, VT, Expand);
2049 setOperationAction(ISD::FATAN, VT, Expand);
2050 setOperationAction(ISD::FATAN2, VT, Expand);
2051 setOperationAction(ISD::FSINH, VT, Expand);
2052 setOperationAction(ISD::FCOSH, VT, Expand);
2053 setOperationAction(ISD::FTANH, VT, Expand);
2054 setOperationAction(ISD::FPOW, VT, Expand);
2055 setOperationAction(ISD::FLOG, VT, Expand);
2056 setOperationAction(ISD::FLOG2, VT, Expand);
2057 setOperationAction(ISD::FLOG10, VT, Expand);
2058 setOperationAction(ISD::FEXP, VT, Expand);
2059 setOperationAction(ISD::FEXP2, VT, Expand);
2060 setOperationAction(ISD::FEXP10, VT, Expand);
2061 }
2062
2063 // But we do support custom-lowering for FCOPYSIGN.
2064 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2065 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2066 VT == MVT::v8f16) &&
2067 Subtarget->hasFullFP16()))
2069
2082
2086 for (MVT InnerVT : MVT::all_valuetypes())
2087 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2088
2089 // CNT supports only B element sizes; wider elements use CNT on bytes followed by UADDLP to widen.
2090 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2092
2098
2099 for (unsigned Opcode :
2102 setOperationAction(Opcode, VT, Custom);
2103
2104 if (!VT.isFloatingPoint())
2106
2107 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2108 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2109 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2110 setOperationAction(Opcode, VT, Legal);
2111
2112 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2113 // NEON types.
2114 if (VT.isFloatingPoint() &&
2115 VT.getVectorElementType() != MVT::bf16 &&
2116 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2117 for (unsigned Opcode :
2118 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
2119 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
2123 setOperationAction(Opcode, VT, Legal);
2124
2125 // Strict fp extend and trunc are legal
2126 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2128 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2130
2131 // FIXME: We could potentially make use of the vector comparison instructions
2132 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2133 // complications:
2134 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2135 // so we would need to expand when the condition code doesn't match the
2136 // kind of comparison.
2137 // * Some kinds of comparison require more than one FCMXY instruction so
2138 // would need to be expanded instead.
2139 // * The lowering of the non-strict versions involves target-specific ISD
2140 // nodes so we would likely need to add strict versions of all of them and
2141 // handle them appropriately.
2144
2145 // When little-endian we can use ordinary d and q register loads/stores for
2146 // vector types, but when big-endian we need to use structure load/store which
2147 // only allow post-index addressing.
2148 if (Subtarget->isLittleEndian()) {
2149 for (unsigned im = (unsigned)ISD::PRE_INC;
2150 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2153 }
2154 } else {
2157 }
2158
2159 if (Subtarget->hasD128()) {
2162 }
2163
2164 if (VT.isInteger()) {
2165 // Let common code emit inverted variants of compares we do support.
2171 }
2172}
2173
2175 EVT OpVT) const {
2176 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2177 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2178 ResVT.getVectorElementType() != MVT::i1)
2179 return true;
2180
2181 // Only support illegal types if the result is scalable and min elements > 1.
2182 if (ResVT.getVectorMinNumElements() == 1 ||
2183 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2184 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2185 return true;
2186
2187 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2188 // but anything larger should be expanded.
2189 if (OpVT.getFixedSizeInBits() > 64)
2190 return true;
2191
2192 return false;
2193}
2194
2196 if (!Subtarget->isSVEorStreamingSVEAvailable())
2197 return true;
2198
2199 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2200 // also support fixed-width predicates.
2201 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2202 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2203 VT != MVT::v4i1 && VT != MVT::v2i1;
2204}
2205
2207 unsigned SearchSize) const {
2208 // MATCH is SVE2 and only available in non-streaming mode.
2209 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2210 return true;
2211 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2212 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2213 return SearchSize != 8;
2214 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2215 return SearchSize != 8 && SearchSize != 16;
2216 return true;
2217}
2218
2219void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2220 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2221
2222 // By default everything must be expanded.
2223 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2225
2226 if (VT.isFloatingPoint()) {
2236 }
2237
2239 VT == MVT::v1f64 ? Expand : Custom;
2240
2241 // Mark integer truncating stores/extending loads as having custom lowering
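// (e.g. for v4i32 this covers truncating stores to, and extending loads from,
// v4i8 and v4i16)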
2242 if (VT.isInteger()) {
2243 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2244 while (InnerVT != VT) {
2245 setTruncStoreAction(VT, InnerVT, Default);
2246 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2247 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2248 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2249 InnerVT = InnerVT.changeVectorElementType(
2250 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2251 }
2252 }
2253
2254 // Mark floating-point truncating stores/extending loads as having custom
2255 // lowering
2256 if (VT.isFloatingPoint()) {
2257 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2258 while (InnerVT != VT) {
2259 setTruncStoreAction(VT, InnerVT, Custom);
2260 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2261 InnerVT = InnerVT.changeVectorElementType(
2263 }
2264 }
2265
2266 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2267 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2268
2269 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2270 ISD::PARTIAL_REDUCE_UMLA};
2271 unsigned NumElts = VT.getVectorNumElements();
2272 if (VT.getVectorElementType() == MVT::i64) {
2273 setPartialReduceMLAAction(MLAOps, VT,
2274 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2275 setPartialReduceMLAAction(MLAOps, VT,
2276 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2277 setPartialReduceMLAAction(MLAOps, VT,
2278 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2279 } else if (VT.getVectorElementType() == MVT::i32) {
2280 setPartialReduceMLAAction(MLAOps, VT,
2281 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2282 setPartialReduceMLAAction(MLAOps, VT,
2283 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2284 } else if (VT.getVectorElementType() == MVT::i16) {
2285 setPartialReduceMLAAction(MLAOps, VT,
2286 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2287 }
2288 if (Subtarget->hasMatMulInt8()) {
2289 if (VT.getVectorElementType() == MVT::i32)
2290 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2291 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2292 else if (VT.getVectorElementType() == MVT::i64)
2293 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2294 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2295 }
2296
2297 if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
2298 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT,
2299 MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
2300 }
2301
2302 // Lower fixed length vector operations to scalable equivalents.
2309 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2319 setOperationAction(ISD::FABS, VT, Default);
2321 setOperationAction(ISD::FCEIL, VT, Default);
2324 setOperationAction(ISD::FFLOOR, VT, Default);
2326 setOperationAction(ISD::FMAXIMUM, VT, Default);
2327 setOperationAction(ISD::FMAXNUM, VT, Default);
2328 setOperationAction(ISD::FMINIMUM, VT, Default);
2329 setOperationAction(ISD::FMINNUM, VT, Default);
2331 setOperationAction(ISD::FNEARBYINT, VT, Default);
2332 setOperationAction(ISD::FNEG, VT, Default);
2333 setOperationAction(ISD::FP_EXTEND, VT, Default);
2337 setOperationAction(ISD::FRINT, VT, Default);
2338 setOperationAction(ISD::LRINT, VT, Default);
2339 setOperationAction(ISD::LLRINT, VT, Default);
2340 setOperationAction(ISD::FROUND, VT, Default);
2341 setOperationAction(ISD::FROUNDEVEN, VT, Default);
2342 setOperationAction(ISD::FSQRT, VT, Default);
2344 setOperationAction(ISD::FTRUNC, VT, Default);
2345 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Default);
2347 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2348 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2349 setOperationAction(ISD::MLOAD, VT, Default);
2350 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2351 setOperationAction(ISD::MSTORE, VT, Default);
2369 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2376 setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
2377 setOperationAction(ISD::VECREDUCE_AND, VT, Default);
2378 setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
2379 setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
2380 setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
2381 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
2382 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
2383 setOperationAction(ISD::VECREDUCE_OR, VT, Default);
2384 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
2385 setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
2386 setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
2387 setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
2388 setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
2389 setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
2395}
2396
2397void AArch64TargetLowering::addDRType(MVT VT) {
2398 addRegisterClass(VT, &AArch64::FPR64RegClass);
2399 if (Subtarget->isNeonAvailable())
2400 addTypeForNEON(VT);
2401}
2402
2403void AArch64TargetLowering::addQRType(MVT VT) {
2404 addRegisterClass(VT, &AArch64::FPR128RegClass);
2405 if (Subtarget->isNeonAvailable())
2406 addTypeForNEON(VT);
2407}
2408
2410 LLVMContext &C, EVT VT) const {
2411 if (!VT.isVector())
2412 return MVT::i32;
2413 if (VT.isScalableVector())
2414 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2416}
2417
2418// isIntImmediate - This method tests to see if the node is a constant
2419// operand. If so Imm will receive the value.
2420static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2422 Imm = C->getZExtValue();
2423 return true;
2424 }
2425 return false;
2426}
2427
2428bool isVectorizedBinOp(unsigned Opcode) {
2429 switch (Opcode) {
2430 case AArch64ISD::SQDMULH:
2431 return true;
2432 default:
2433 return false;
2434 }
2435}
2436
2437// isOpcWithIntImmediate - This method tests to see if the node is a specific
2438 // opcode and that it has an immediate integer right operand.
2439// If so Imm will receive the value.
2440static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2441 uint64_t &Imm) {
2442 return N->getOpcode() == Opc &&
2443 isIntImmediate(N->getOperand(1).getNode(), Imm);
2444}
2445
2446static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2447 const APInt &Demanded,
2449 unsigned NewOpc) {
2450 uint64_t OldImm = Imm, NewImm, Enc;
2451 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2452
2453 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2454 // bimm64.
2455 if (Imm == 0 || Imm == Mask ||
2457 return false;
2458
2459 unsigned EltSize = Size;
2460 uint64_t DemandedBits = Demanded.getZExtValue();
2461
2462 // Clear bits that are not demanded.
2463 Imm &= DemandedBits;
2464
2465 while (true) {
2466 // The goal here is to set the non-demanded bits in a way that minimizes
2467 // the number of switching between 0 and 1. In order to achieve this goal,
2468 // we set the non-demanded bits to the value of the preceding demanded bits.
2469 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2470 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2471 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2472 // The final result is 0b11000011.
2473 uint64_t NonDemandedBits = ~DemandedBits;
2474 uint64_t InvertedImm = ~Imm & DemandedBits;
2475 uint64_t RotatedImm =
2476 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2477 NonDemandedBits;
2478 uint64_t Sum = RotatedImm + NonDemandedBits;
2479 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2480 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2481 NewImm = (Imm | Ones) & Mask;
2482
2483 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2484 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2485 // we halve the element size and continue the search.
2486 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2487 break;
2488
2489 // We cannot shrink the element size any further if it is 2-bits.
2490 if (EltSize == 2)
2491 return false;
2492
2493 EltSize /= 2;
2494 Mask >>= EltSize;
2495 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2496
2497 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2498 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2499 return false;
2500
2501 // Merge the upper and lower halves of Imm and DemandedBits.
2502 Imm |= Hi;
2503 DemandedBits |= DemandedBitsHi;
2504 }
2505
2506 ++NumOptimizedImms;
2507
2508 // Replicate the element across the register width.
2509 while (EltSize < Size) {
2510 NewImm |= NewImm << EltSize;
2511 EltSize *= 2;
2512 }
2513
2514 (void)OldImm;
2515 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2516 "demanded bits should never be altered");
2517 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2518
2519 // Create the new constant immediate node.
2520 EVT VT = Op.getValueType();
2521 SDLoc DL(Op);
2522 SDValue New;
2523
2524 // If the new constant immediate is all-zeros or all-ones, let the target
2525 // independent DAG combine optimize this node.
2526 if (NewImm == 0 || NewImm == OrigMask) {
2527 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2528 TLO.DAG.getConstant(NewImm, DL, VT));
2529 // Otherwise, create a machine node so that target independent DAG combine
2530 // doesn't undo this optimization.
2531 } else {
2533 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2534 New = SDValue(
2535 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2536 }
2537
2538 return TLO.CombineTo(Op, New);
2539}
2540
2542 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2543 TargetLoweringOpt &TLO) const {
2544 // Delay this optimization to as late as possible.
2545 if (!TLO.LegalOps)
2546 return false;
2547
2549 return false;
2550
2551 EVT VT = Op.getValueType();
2552 if (VT.isVector())
2553 return false;
2554
2555 unsigned Size = VT.getSizeInBits();
2556
2557 if (Size != 32 && Size != 64)
2558 return false;
2559
2560 // Exit early if we demand all bits.
2561 if (DemandedBits.isAllOnes())
2562 return false;
2563
2564 unsigned NewOpc;
2565 switch (Op.getOpcode()) {
2566 default:
2567 return false;
2568 case ISD::AND:
2569 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2570 break;
2571 case ISD::OR:
2572 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2573 break;
2574 case ISD::XOR:
2575 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2576 break;
2577 }
2578 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2579 if (!C)
2580 return false;
2581 uint64_t Imm = C->getZExtValue();
2582 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2583}
2584
2585/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2586/// Mask are known to be either zero or one and return them Known.
2588 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2589 const SelectionDAG &DAG, unsigned Depth) const {
2590 switch (Op.getOpcode()) {
2591 default:
2592 break;
2593 case AArch64ISD::DUP: {
2594 SDValue SrcOp = Op.getOperand(0);
2595 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2596 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2597 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2598 "Expected DUP implicit truncation");
2599 Known = Known.trunc(Op.getScalarValueSizeInBits());
2600 }
2601 break;
2602 }
2603 case AArch64ISD::CSEL: {
2604 KnownBits Known2;
2605 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2606 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2607 Known = Known.intersectWith(Known2);
2608 break;
2609 }
2610 case AArch64ISD::CSNEG:
2611 case AArch64ISD::CSINC:
2612 case AArch64ISD::CSINV: {
2613 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2614 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2615
2616 // The result is either:
2617 // CSINC: KnownOp0 or KnownOp1 + 1
2618 // CSINV: KnownOp0 or ~KnownOp1
2619 // CSNEG: KnownOp0 or KnownOp1 * -1
2620 if (Op.getOpcode() == AArch64ISD::CSINC)
2621 KnownOp1 = KnownBits::add(
2622 KnownOp1,
2623 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2624 else if (Op.getOpcode() == AArch64ISD::CSINV)
2625 std::swap(KnownOp1.Zero, KnownOp1.One);
2626 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2627 KnownOp1 =
2629 Op.getScalarValueSizeInBits())));
2630
2631 Known = KnownOp0.intersectWith(KnownOp1);
2632 break;
2633 }
2634 case AArch64ISD::BICi: {
2635 // Compute the bit cleared value.
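// BICi computes op0 & ~(imm << shift), so the bits selected by the shifted
// immediate are known to be clear in the result.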
2636 APInt Mask =
2637 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2638 .trunc(Known.getBitWidth());
2639 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2640 Known &= KnownBits::makeConstant(Mask);
2641 break;
2642 }
2643 case AArch64ISD::VLSHR: {
2644 KnownBits Known2;
2645 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2646 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2647 Known = KnownBits::lshr(Known, Known2);
2648 break;
2649 }
2650 case AArch64ISD::VASHR: {
2651 KnownBits Known2;
2652 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2653 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2654 Known = KnownBits::ashr(Known, Known2);
2655 break;
2656 }
2657 case AArch64ISD::VSHL: {
2658 KnownBits Known2;
2659 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2660 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2661 Known = KnownBits::shl(Known, Known2);
2662 break;
2663 }
2664 case AArch64ISD::MOVI: {
2666 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2667 break;
2668 }
2669 case AArch64ISD::MOVIshift: {
2671 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2672 << Op->getConstantOperandVal(1)));
2673 break;
2674 }
2675 case AArch64ISD::MOVImsl: {
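// MOVI with an MSL modifier shifts ones (not zeros) into the low bits, hence
// the materialized value is ~(~imm << shift).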
2676 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2678 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2679 break;
2680 }
2681 case AArch64ISD::MOVIedit: {
2683 Known.getBitWidth(),
2684 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2685 break;
2686 }
2687 case AArch64ISD::MVNIshift: {
2689 APInt(Known.getBitWidth(),
2690 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2691 /*isSigned*/ false, /*implicitTrunc*/ true));
2692 break;
2693 }
2694 case AArch64ISD::MVNImsl: {
2695 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2697 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2698 /*isSigned*/ false, /*implicitTrunc*/ true));
2699 break;
2700 }
2701 case AArch64ISD::LOADgot:
2702 case AArch64ISD::ADDlow: {
2703 if (!Subtarget->isTargetILP32())
2704 break;
2705 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2706 Known.Zero = APInt::getHighBitsSet(64, 32);
2707 break;
2708 }
2709 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2710 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2711 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2712 break;
2713 }
2715 Intrinsic::ID IntID =
2716 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2717 switch (IntID) {
2718 default: return;
2719 case Intrinsic::aarch64_ldaxr:
2720 case Intrinsic::aarch64_ldxr: {
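// These exclusive loads zero-extend the loaded value to the result width, so
// every bit above the memory width is known to be zero.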
2721 unsigned BitWidth = Known.getBitWidth();
2722 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2723 unsigned MemBits = VT.getScalarSizeInBits();
2724 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2725 return;
2726 }
2727 }
2728 break;
2729 }
2731 case ISD::INTRINSIC_VOID: {
2732 unsigned IntNo = Op.getConstantOperandVal(0);
2733 switch (IntNo) {
2734 default:
2735 break;
2736 case Intrinsic::aarch64_neon_uaddlv: {
2737 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2738 unsigned BitWidth = Known.getBitWidth();
2739 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2740 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
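// The unsigned sum of 8 byte lanes is at most 8 * 255 < 2^11 and of 16 byte
// lanes at most 16 * 255 < 2^12, so all bits from Bound upwards are zero.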
2741 assert(BitWidth >= Bound && "Unexpected width!");
2743 Known.Zero |= Mask;
2744 }
2745 break;
2746 }
2747 case Intrinsic::aarch64_neon_umaxv:
2748 case Intrinsic::aarch64_neon_uminv: {
2749 // Figure out the datatype of the vector operand. The UMINV instruction
2750 // will zero extend the result, so we can mark as known zero all the
2751 // bits larger than the element datatype. 32-bit or larger doesn't need
2752 // this as those are legal types and will be handled by isel directly.
2753 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2754 unsigned BitWidth = Known.getBitWidth();
2755 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2756 assert(BitWidth >= 8 && "Unexpected width!");
2758 Known.Zero |= Mask;
2759 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2760 assert(BitWidth >= 16 && "Unexpected width!");
2762 Known.Zero |= Mask;
2763 }
2764 break;
2765 }
2766 }
2767 }
2768 }
2769}
2770
2772 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2773 unsigned Depth) const {
2774 EVT VT = Op.getValueType();
2775 unsigned VTBits = VT.getScalarSizeInBits();
2776 unsigned Opcode = Op.getOpcode();
2777 switch (Opcode) {
2778 case AArch64ISD::FCMEQ:
2779 case AArch64ISD::FCMGE:
2780 case AArch64ISD::FCMGT:
2781 // Compares return either 0 or all-ones
2782 return VTBits;
2783 case AArch64ISD::VASHR: {
2784 unsigned Tmp =
2785 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2786 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2787 }
2788 }
2789
2790 return 1;
2791}
2792
2794 EVT) const {
2795 return MVT::i64;
2796}
2797
2799 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2800 unsigned *Fast) const {
2801
2802 // Allow SVE loads/stores where the alignment >= the size of the element type,
2803 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2804 // for stores that come from IR, only require element-size alignment (even if
2805 // unaligned accesses are disabled). Without this, these will be forced to
2806 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2807 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2808 if (VT.isScalableVector()) {
2809 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2810 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2811 return true;
2812 }
2813
2814 if (Subtarget->requiresStrictAlign())
2815 return false;
2816
2817 if (Fast) {
2818 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2819 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2820 // See comments in performSTORECombine() for more details about
2821 // these conditions.
2822
2823 // Code that uses clang vector extensions can mark that it
2824 // wants unaligned accesses to be treated as fast by
2825 // underspecifying alignment to be 1 or 2.
2826 Alignment <= 2 ||
2827
2828 // Disregard v2i64. Memcpy lowering produces those and splitting
2829 // them regresses performance on micro-benchmarks and olden/bh.
2830 VT == MVT::v2i64;
2831 }
2832 return true;
2833}
2834
2835// Same as above but handling LLTs instead.
2837 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2838 unsigned *Fast) const {
2839 if (Subtarget->requiresStrictAlign())
2840 return false;
2841
2842 if (Fast) {
2843 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2844 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2845 Ty.getSizeInBytes() != 16 ||
2846 // See comments in performSTORECombine() for more details about
2847 // these conditions.
2848
2849 // Code that uses clang vector extensions can mark that it
2850 // wants unaligned accesses to be treated as fast by
2851 // underspecifying alignment to be 1 or 2.
2852 Alignment <= 2 ||
2853
2854 // Disregard v2i64. Memcpy lowering produces those and splitting
2855 // them regresses performance on micro-benchmarks and olden/bh.
2856 Ty == LLT::fixed_vector(2, 64);
2857 }
2858 return true;
2859}
2860
2861FastISel *
2863 const TargetLibraryInfo *libInfo) const {
2864 return AArch64::createFastISel(funcInfo, libInfo);
2865}
2866
2869 MachineBasicBlock *MBB) const {
2870 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2871 // phi node:
2872
2873 // OrigBB:
2874 // [... previous instrs leading to comparison ...]
2875 // b.ne TrueBB
2876 // b EndBB
2877 // TrueBB:
2878 // ; Fallthrough
2879 // EndBB:
2880 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2881
2882 MachineFunction *MF = MBB->getParent();
2883 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2884 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2885 DebugLoc DL = MI.getDebugLoc();
2886 MachineFunction::iterator It = ++MBB->getIterator();
2887
2888 Register DestReg = MI.getOperand(0).getReg();
2889 Register IfTrueReg = MI.getOperand(1).getReg();
2890 Register IfFalseReg = MI.getOperand(2).getReg();
2891 unsigned CondCode = MI.getOperand(3).getImm();
2892 bool NZCVKilled = MI.getOperand(4).isKill();
2893
2894 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2895 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2896 MF->insert(It, TrueBB);
2897 MF->insert(It, EndBB);
2898
2899 // Transfer rest of current basic-block to EndBB
2900 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2901 MBB->end());
2903
2904 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2905 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2906 MBB->addSuccessor(TrueBB);
2907 MBB->addSuccessor(EndBB);
2908
2909 // TrueBB falls through to the end.
2910 TrueBB->addSuccessor(EndBB);
2911
2912 if (!NZCVKilled) {
2913 TrueBB->addLiveIn(AArch64::NZCV);
2914 EndBB->addLiveIn(AArch64::NZCV);
2915 }
2916
2917 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2918 .addReg(IfTrueReg)
2919 .addMBB(TrueBB)
2920 .addReg(IfFalseReg)
2921 .addMBB(MBB);
2922
2923 MI.eraseFromParent();
2924 return EndBB;
2925}
2926
2934
2937 MachineBasicBlock *MBB) const {
2938 MachineFunction &MF = *MBB->getParent();
2939 MachineBasicBlock::iterator MBBI = MI.getIterator();
2940 const AArch64InstrInfo &TII =
2941 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2942 Register TargetReg = MI.getOperand(0).getReg();
2944 TII.probedStackAlloc(MBBI, TargetReg, false);
2945
2946 MI.eraseFromParent();
2947 return NextInst->getParent();
2948}
2949
2952 MachineBasicBlock *MBB) const {
2953 MachineFunction *MF = MBB->getParent();
2955
2956 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
2957 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
2958
2959 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
2960 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
2961 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
2962 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
2963
2964 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2965 DebugLoc DL = MI.getDebugLoc();
2966
2967 // RDVL requires GPR64, ADDSVL requires GPR64sp.
2968 // We need to insert COPY instructions; these will later be removed by the
2969 // RegisterCoalescer.
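// The sequence computes VL - SVL: RDVL #1 reads the non-streaming vector
// length in bytes and ADDSVL with #-1 subtracts the streaming vector length;
// a zero difference means the two vector lengths match.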
2970 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
2971 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
2972 .addReg(RegVL_GPR);
2973
2974 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
2975 .addReg(RegVL_GPRsp)
2976 .addImm(-1);
2977 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
2978 .addReg(RegSVL_GPRsp);
2979
2980 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2981 MachineFunction::iterator It = ++MBB->getIterator();
2982 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
2983 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
2984 MF->insert(It, TrapBB);
2985 MF->insert(It, PassBB);
2986
2987 // Continue if vector lengths match
2988 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
2989 .addReg(RegSVL_GPR)
2990 .addMBB(PassBB);
2991
2992 // Transfer rest of current BB to PassBB
2993 PassBB->splice(PassBB->begin(), MBB,
2994 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2996
2997 // Trap if vector lengths mismatch
2998 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
2999
3000 MBB->addSuccessor(TrapBB);
3001 MBB->addSuccessor(PassBB);
3002
3003 MI.eraseFromParent();
3004 return PassBB;
3005}
3006
3008AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3010 MachineBasicBlock *BB) const {
3011 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3012 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3013
3014 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3015 MIB.add(MI.getOperand(1)); // slice index register
3016 MIB.add(MI.getOperand(2)); // slice index offset
3017 MIB.add(MI.getOperand(3)); // pg
3018 MIB.add(MI.getOperand(4)); // base
3019 MIB.add(MI.getOperand(5)); // offset
3020
3021 MI.eraseFromParent(); // The pseudo is gone now.
3022 return BB;
3023}
3024
3027 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3029 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3030
3031 MIB.addReg(AArch64::ZA, RegState::Define);
3032 MIB.add(MI.getOperand(0)); // Vector select register
3033 MIB.add(MI.getOperand(1)); // Vector select offset
3034 MIB.add(MI.getOperand(2)); // Base
3035 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3036
3037 MI.eraseFromParent(); // The pseudo is gone now.
3038 return BB;
3039}
3040
3043 unsigned Opcode,
3044 bool Op0IsDef) const {
3045 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3047
3048 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3049 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3050 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3051 MIB.add(MI.getOperand(I));
3052
3053 MI.eraseFromParent(); // The pseudo is gone now.
3054 return BB;
3055}
3056
3058AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3060 MachineBasicBlock *BB) const {
3061 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3062 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3063 unsigned StartIdx = 0;
3064
3065 bool HasTile = BaseReg != AArch64::ZA;
3066 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3067 if (HasZPROut) {
3068 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3069 ++StartIdx;
3070 }
3071 if (HasTile) {
3072 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3073 RegState::Define); // Output ZA Tile
3074 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3075 StartIdx++;
3076 } else {
3077 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3078 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3079 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3080 ++StartIdx;
3081 }
3082 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3083 }
3084 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3085 MIB.add(MI.getOperand(I));
3086
3087 MI.eraseFromParent(); // The pseudo is gone now.
3088 return BB;
3089}
3090
3093 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3095 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3096 MIB.add(MI.getOperand(0)); // Mask
3097
3098 unsigned Mask = MI.getOperand(0).getImm();
3099 for (unsigned I = 0; I < 8; I++) {
3100 if (Mask & (1 << I))
3101 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3102 }
3103
3104 MI.eraseFromParent(); // The pseudo is gone now.
3105 return BB;
3106}
3107
3110 MachineBasicBlock *BB) const {
3111 MachineFunction *MF = BB->getParent();
3112 MachineFrameInfo &MFI = MF->getFrameInfo();
3114 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3115 if (TPIDR2.Uses > 0) {
3116 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3117 // generally don't support big-endian SVE/SME.
3118 if (!Subtarget->isLittleEndian())
3120 "TPIDR2 block initialization is not supported on big-endian targets");
3121
3122 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3123 // Store buffer pointer and num_za_save_slices.
3124 // Bytes 10-15 are implicitly zeroed.
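// The STP below writes 16 bytes: the save-buffer pointer at offset 0 and the
// X register holding num_za_save_slices at offset 8. Since the slice count
// fits in 16 bits, the upper bytes of that register supply the zeros.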
3125 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3126 .addReg(MI.getOperand(0).getReg())
3127 .addReg(MI.getOperand(1).getReg())
3128 .addFrameIndex(TPIDR2.FrameIndex)
3129 .addImm(0);
3130 } else
3131 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3132
3133 BB->remove_instr(&MI);
3134 return BB;
3135}
3136
3139 MachineBasicBlock *BB) const {
3140 MachineFunction *MF = BB->getParent();
3141 MachineFrameInfo &MFI = MF->getFrameInfo();
3143 // TODO This function grows the stack with a subtraction, which doesn't work
3144 // on Windows. Some refactoring to share the functionality in
3145 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3146 // supports SME
3148 "Lazy ZA save is not yet supported on Windows");
3149
3150 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3151
3152 if (TPIDR2.Uses > 0) {
3153 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3155
3156 // The SUBXrs below won't always be emitted in a form that accepts SP
3157 // directly
3158 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3159 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3160 .addReg(AArch64::SP);
3161
3162 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
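// MSUB computes Dest = SP - Size * Size in a single instruction; the result
// then serves as both the buffer pointer and the new stack pointer.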
3163 auto Size = MI.getOperand(1).getReg();
3164 auto Dest = MI.getOperand(0).getReg();
3165 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3166 .addReg(Size)
3167 .addReg(Size)
3168 .addReg(SP);
3169 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3170 AArch64::SP)
3171 .addReg(Dest);
3172
3173 // We have just allocated a variable sized object, tell this to PEI.
3174 MFI.CreateVariableSizedObject(Align(16), nullptr);
3175 }
3176
3177 BB->remove_instr(&MI);
3178 return BB;
3179}
3180
3181// TODO: Find a way to merge this with EmitAllocateZABuffer.
3184 MachineBasicBlock *BB) const {
3185 MachineFunction *MF = BB->getParent();
3186 MachineFrameInfo &MFI = MF->getFrameInfo();
3189 "Lazy ZA save is not yet supported on Windows");
3190
3191 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3192 if (FuncInfo->isSMESaveBufferUsed()) {
3193 // Allocate a buffer object of the size given by MI.getOperand(1).
3194 auto Size = MI.getOperand(1).getReg();
3195 auto Dest = MI.getOperand(0).getReg();
3196 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3197 .addReg(AArch64::SP)
3198 .addReg(Size)
3200 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3201 .addReg(AArch64::SP);
3202
3203 // We have just allocated a variable sized object, tell this to PEI.
3204 MFI.CreateVariableSizedObject(Align(16), nullptr);
3205 } else
3206 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3207 MI.getOperand(0).getReg());
3208
3209 BB->remove_instr(&MI);
3210 return BB;
3211}
3212
3215 MachineBasicBlock *BB) const {
3216 // If the buffer is used, emit a call to __arm_sme_state_size()
3217 MachineFunction *MF = BB->getParent();
3219 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3220 if (FuncInfo->isSMESaveBufferUsed()) {
3221 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3222 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3223 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3225 .addReg(AArch64::X0, RegState::ImplicitDefine)
3226 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3227 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3228 MI.getOperand(0).getReg())
3229 .addReg(AArch64::X0);
3230 } else
3231 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3232 MI.getOperand(0).getReg())
3233 .addReg(AArch64::XZR);
3234 BB->remove_instr(&MI);
3235 return BB;
3236}
3237
3240 MachineBasicBlock *BB) const {
3241 MachineFunction *MF = BB->getParent();
3242 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3243 const DebugLoc &DL = MI.getDebugLoc();
3244 Register ResultReg = MI.getOperand(0).getReg();
3245 if (MF->getRegInfo().use_empty(ResultReg)) {
3246 // Nothing to do. Pseudo erased below.
3247 } else if (Subtarget->hasSME()) {
3248 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3249 .addImm(AArch64SysReg::SVCR)
3250 .addReg(AArch64::VG, RegState::Implicit);
3251 } else {
3252 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3253 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3254 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3256 .addReg(AArch64::X0, RegState::ImplicitDefine)
3257 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3258 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3259 .addReg(AArch64::X0);
3260 }
3261 MI.eraseFromParent();
3262 return BB;
3263}
3264
3265// Helper function to find the instruction that defined a virtual register.
3266 // If unable to find such an instruction, returns nullptr.
3268 Register Reg) {
3269 while (Reg.isVirtual()) {
3270 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3271 assert(DefMI && "Virtual register definition not found");
3272 unsigned Opcode = DefMI->getOpcode();
3273
3274 if (Opcode == AArch64::COPY) {
3275 Reg = DefMI->getOperand(1).getReg();
3276 // Vreg is defined by copying from physreg.
3277 if (Reg.isPhysical())
3278 return DefMI;
3279 continue;
3280 }
3281 if (Opcode == AArch64::SUBREG_TO_REG) {
3282 Reg = DefMI->getOperand(2).getReg();
3283 continue;
3284 }
3285
3286 return DefMI;
3287 }
3288 return nullptr;
3289}
3290
3293 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3294 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3295 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3296 const DebugLoc &DL = MI.getDebugLoc();
3297
3298 Register AddrDisc = AddrDiscOp.getReg();
3299 int64_t IntDisc = IntDiscOp.getImm();
3300 assert(IntDisc == 0 && "Blend components are already expanded");
3301
3302 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3303 if (DiscMI) {
3304 switch (DiscMI->getOpcode()) {
3305 case AArch64::MOVKXi:
3306 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3307 // #imm should be an immediate and not a global symbol, for example.
3308 if (DiscMI->getOperand(2).isImm() &&
3309 DiscMI->getOperand(3).getImm() == 48) {
3310 AddrDisc = DiscMI->getOperand(1).getReg();
3311 IntDisc = DiscMI->getOperand(2).getImm();
3312 }
3313 break;
3314 case AArch64::MOVi32imm:
3315 case AArch64::MOVi64imm:
3316 // Small immediate integer constant passed via VReg.
3317 if (DiscMI->getOperand(1).isImm() &&
3318 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3319 AddrDisc = AArch64::NoRegister;
3320 IntDisc = DiscMI->getOperand(1).getImm();
3321 }
3322 break;
3323 }
3324 }
3325
3326 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3327 // in the requested register class.
3328 if (AddrDisc == AArch64::XZR)
3329 AddrDisc = AArch64::NoRegister;
3330
3331 // Make sure AddrDisc operand respects the register class imposed by MI.
3332 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3333 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3334 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3335 AddrDisc = TmpReg;
3336 }
3337
3338 AddrDiscOp.setReg(AddrDisc);
3339 IntDiscOp.setImm(IntDisc);
3340}
3341
3343 MachineInstr &MI, MachineBasicBlock *BB) const {
3344
3345 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3346 if (SMEOrigInstr != -1) {
3347 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3348 uint64_t SMEMatrixType =
3349 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3350 switch (SMEMatrixType) {
3352 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3354 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3356 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3358 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3360 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3362 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3363 }
3364 }
3365
3366 switch (MI.getOpcode()) {
3367 default:
3368#ifndef NDEBUG
3369 MI.dump();
3370#endif
3371 llvm_unreachable("Unexpected instruction for custom inserter!");
3372 case AArch64::InitTPIDR2Obj:
3373 return EmitInitTPIDR2Object(MI, BB);
3374 case AArch64::AllocateZABuffer:
3375 return EmitAllocateZABuffer(MI, BB);
3376 case AArch64::AllocateSMESaveBuffer:
3377 return EmitAllocateSMESaveBuffer(MI, BB);
3378 case AArch64::GetSMESaveSize:
3379 return EmitGetSMESaveSize(MI, BB);
3380 case AArch64::EntryPStateSM:
3381 return EmitEntryPStateSM(MI, BB);
3382 case AArch64::F128CSEL:
3383 return EmitF128CSEL(MI, BB);
3384 case TargetOpcode::STATEPOINT:
3385 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3386 // while bl call instruction (where statepoint will be lowered at the end)
3387 // has implicit def. This def is early-clobber as it will be set at
3388 // the moment of the call and earlier than any use is read.
3389 // Add this implicit dead def here as a workaround.
3390 MI.addOperand(*MI.getMF(),
3392 AArch64::LR, /*isDef*/ true,
3393 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3394 /*isUndef*/ false, /*isEarlyClobber*/ true));
3395 [[fallthrough]];
3396 case TargetOpcode::STACKMAP:
3397 case TargetOpcode::PATCHPOINT:
3398 return emitPatchPoint(MI, BB);
3399
3400 case TargetOpcode::PATCHABLE_EVENT_CALL:
3401 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3402 return BB;
3403
3404 case AArch64::CATCHRET:
3405 return EmitLoweredCatchRet(MI, BB);
3406
3407 case AArch64::PROBED_STACKALLOC_DYN:
3408 return EmitDynamicProbedAlloc(MI, BB);
3409
3410 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3411 return EmitCheckMatchingVL(MI, BB);
3412
3413 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3414 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3415 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3416 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3417 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3418 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3419 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3420 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3421 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3422 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3423 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3424 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3425 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3426 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3427 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3428 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3429 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3430 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3431 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3432 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3433 case AArch64::LDR_ZA_PSEUDO:
3434 return EmitFill(MI, BB);
3435 case AArch64::LDR_TX_PSEUDO:
3436 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3437 case AArch64::STR_TX_PSEUDO:
3438 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3439 case AArch64::ZERO_M_PSEUDO:
3440 return EmitZero(MI, BB);
3441 case AArch64::ZERO_T_PSEUDO:
3442 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3443 case AArch64::MOVT_TIZ_PSEUDO:
3444 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3445
3446 case AArch64::PAC:
3447 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3448 &AArch64::GPR64noipRegClass);
3449 return BB;
3450 }
3451}
3452
3453//===----------------------------------------------------------------------===//
3454// AArch64 Lowering private implementation.
3455//===----------------------------------------------------------------------===//
3456
3457//===----------------------------------------------------------------------===//
3458// Lowering Code
3459//===----------------------------------------------------------------------===//
3460
3461// Forward declarations of SVE fixed length lowering helpers
3466 SelectionDAG &DAG);
3469 EVT VT);
3470
3471/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3472static bool isZerosVector(const SDNode *N) {
3473 // Look through a bit convert.
3474 while (N->getOpcode() == ISD::BITCAST)
3475 N = N->getOperand(0).getNode();
3476
3478 return true;
3479
3480 if (N->getOpcode() != AArch64ISD::DUP)
3481 return false;
3482
3483 auto Opnd0 = N->getOperand(0);
3484 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3485}
3486
3487/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3488/// CC
3490 SDValue RHS = {}) {
3491 switch (CC) {
3492 default:
3493 llvm_unreachable("Unknown condition code!");
3494 case ISD::SETNE:
3495 return AArch64CC::NE;
3496 case ISD::SETEQ:
3497 return AArch64CC::EQ;
3498 case ISD::SETGT:
3499 return AArch64CC::GT;
3500 case ISD::SETGE:
3502 case ISD::SETLT:
3504 case ISD::SETLE:
3505 return AArch64CC::LE;
3506 case ISD::SETUGT:
3507 return AArch64CC::HI;
3508 case ISD::SETUGE:
3509 return AArch64CC::HS;
3510 case ISD::SETULT:
3511 return AArch64CC::LO;
3512 case ISD::SETULE:
3513 return AArch64CC::LS;
3514 }
3515}
3516
3517/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3519 AArch64CC::CondCode &CondCode,
3520 AArch64CC::CondCode &CondCode2) {
3521 CondCode2 = AArch64CC::AL;
3522 switch (CC) {
3523 default:
3524 llvm_unreachable("Unknown FP condition!");
3525 case ISD::SETEQ:
3526 case ISD::SETOEQ:
3527 CondCode = AArch64CC::EQ;
3528 break;
3529 case ISD::SETGT:
3530 case ISD::SETOGT:
3531 CondCode = AArch64CC::GT;
3532 break;
3533 case ISD::SETGE:
3534 case ISD::SETOGE:
3535 CondCode = AArch64CC::GE;
3536 break;
3537 case ISD::SETOLT:
3538 CondCode = AArch64CC::MI;
3539 break;
3540 case ISD::SETOLE:
3541 CondCode = AArch64CC::LS;
3542 break;
3543 case ISD::SETONE:
3544 CondCode = AArch64CC::MI;
3545 CondCode2 = AArch64CC::GT;
3546 break;
3547 case ISD::SETO:
3548 CondCode = AArch64CC::VC;
3549 break;
3550 case ISD::SETUO:
3551 CondCode = AArch64CC::VS;
3552 break;
3553 case ISD::SETUEQ:
3554 CondCode = AArch64CC::EQ;
3555 CondCode2 = AArch64CC::VS;
3556 break;
3557 case ISD::SETUGT:
3558 CondCode = AArch64CC::HI;
3559 break;
3560 case ISD::SETUGE:
3561 CondCode = AArch64CC::PL;
3562 break;
3563 case ISD::SETLT:
3564 case ISD::SETULT:
3565 CondCode = AArch64CC::LT;
3566 break;
3567 case ISD::SETLE:
3568 case ISD::SETULE:
3569 CondCode = AArch64CC::LE;
3570 break;
3571 case ISD::SETNE:
3572 case ISD::SETUNE:
3573 CondCode = AArch64CC::NE;
3574 break;
3575 }
3576}
3577
3578/// Convert a DAG fp condition code to an AArch64 CC.
3579/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3580/// should be AND'ed instead of OR'ed.
3582 AArch64CC::CondCode &CondCode,
3583 AArch64CC::CondCode &CondCode2) {
3584 CondCode2 = AArch64CC::AL;
3585 switch (CC) {
3586 default:
3587 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3588 assert(CondCode2 == AArch64CC::AL);
3589 break;
3590 case ISD::SETONE:
3591 // (a one b)
3592 // == ((a olt b) || (a ogt b))
3593 // == ((a ord b) && (a une b))
3594 CondCode = AArch64CC::VC;
3595 CondCode2 = AArch64CC::NE;
3596 break;
3597 case ISD::SETUEQ:
3598 // (a ueq b)
3599 // == ((a uno b) || (a oeq b))
3600 // == ((a ule b) && (a uge b))
3601 CondCode = AArch64CC::PL;
3602 CondCode2 = AArch64CC::LE;
3603 break;
3604 }
3605}
3606
3607/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3608/// CC usable with the vector instructions. Fewer operations are available
3609/// without a real NZCV register, so we have to use less efficient combinations
3610/// to get the same effect.
3612 AArch64CC::CondCode &CondCode,
3613 AArch64CC::CondCode &CondCode2,
3614 bool &Invert) {
3615 Invert = false;
3616 switch (CC) {
3617 default:
3618 // Mostly the scalar mappings work fine.
3619 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3620 break;
3621 case ISD::SETUO:
3622 Invert = true;
3623 [[fallthrough]];
3624 case ISD::SETO:
3625 CondCode = AArch64CC::MI;
3626 CondCode2 = AArch64CC::GE;
3627 break;
3628 case ISD::SETUEQ:
3629 case ISD::SETULT:
3630 case ISD::SETULE:
3631 case ISD::SETUGT:
3632 case ISD::SETUGE:
3633 // All of the compare-mask comparisons are ordered, but we can switch
3634 // between the two by a double inversion. E.g. ULE == !OGT.
3635 Invert = true;
3636 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3637 CondCode, CondCode2);
3638 break;
3639 }
3640}
3641
3642/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3644 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3645 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3646}
3647
3649 // Matches AArch64DAGToDAGISel::SelectArithImmed().
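// That is, a 12-bit unsigned immediate, optionally shifted left by 12 bits:
// e.g. 0xabc and 0xabc000 are legal, while 0x1001 is not.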
3650 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3651 LLVM_DEBUG(dbgs() << "Is imm " << C
3652 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3653 return IsLegal;
3654}
3655
3657 // Works for negative immediates too, as it can be written as an ADDS
3658 // instruction with a negated immediate.
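// For example, -5 is accepted because the compare can be emitted as a
// CMN (ADDS) with the immediate 5.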
3659 return isLegalArithImmed(C.abs().getZExtValue());
3660}
3661
3663 uint64_t Imm = C.getZExtValue();
3665 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3666 return Insn.size();
3667}
3668
3670 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3671 if (Op->getFlags().hasNoSignedWrap())
3672 return true;
3673
3674 // We can still figure out if the second operand is safe to use
3675 // in a CMN instruction by checking whether it is known not to be the minimum
3676 // signed value. If it is not, then we can safely use CMN.
3677 // Note: We can eventually remove this check and simply rely on
3678 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3679 // consistently sets them appropriately when making said nodes.
3680
3681 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3682 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3683}
3684
3685 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3686 // the grounds that "op1 - (-op2) == op1 + op2"? Not always, the C and V flags
3687// can be set differently by this operation. It comes down to whether
3688// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3689// everything is fine. If not then the optimization is wrong. Thus general
3690// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3691//
3692// So, finally, the only LLVM-native comparisons that don't mention C or V
3693// are the ones that aren't unsigned comparisons. They're the only ones we can
3694// safely use CMN for in the absence of information about op2.
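// For example, an equality test against (sub 0, op2) only inspects the Z
// flag, so it can always be emitted as a CMN.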
3696 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3697 (isIntEqualitySetCC(CC) ||
3698 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3699 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3700}
3701
3703 SelectionDAG &DAG, SDValue Chain,
3704 bool IsSignaling) {
3705 EVT VT = LHS.getValueType();
3706 assert(VT != MVT::f128);
3707
3708 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3709
3710 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3711 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3712 {Chain, LHS});
3713 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3714 {LHS.getValue(1), RHS});
3715 Chain = RHS.getValue(1);
3716 }
3717 unsigned Opcode =
3718 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3719 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3720}
3721
3722static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3723 const SDLoc &DL, SelectionDAG &DAG) {
3724 EVT VT = LHS.getValueType();
3725 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3726
3727 if (VT.isFloatingPoint()) {
3728 assert(VT != MVT::f128);
3729 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3730 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3731 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3732 }
3733 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3734 }
3735
3736 // The CMP instruction is just an alias for SUBS, and representing it as
3737 // SUBS means that it's possible to get CSE with subtract operations.
3738 // A later phase can perform the optimization of setting the destination
3739 // register to WZR/XZR if it ends up being unused.
3740 unsigned Opcode = AArch64ISD::SUBS;
3741
3742 if (isCMN(RHS, CC, DAG)) {
3743 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3744 Opcode = AArch64ISD::ADDS;
3745 RHS = RHS.getOperand(1);
3746 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3747 isIntEqualitySetCC(CC)) {
3748 // As we are looking for EQ/NE compares, the operands can be commuted; can
3749 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3750 Opcode = AArch64ISD::ADDS;
3751 LHS = LHS.getOperand(1);
3752 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3753 if (LHS.getOpcode() == ISD::AND) {
3754 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3755 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3756 // of the signed comparisons.
3757 const SDValue ANDSNode =
3758 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3759 LHS.getOperand(0), LHS.getOperand(1));
3760 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3761 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3762 return ANDSNode.getValue(1);
3763 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3764 // Use result of ANDS
3765 return LHS.getValue(1);
3766 }
3767 }
3768
3769 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3770 .getValue(1);
3771}
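// Roughly, the ANDS path above rewrites
//   and w8, w0, w1 ; cmp w8, #0 ; b.eq target
// into
//   ands w8, w0, w1 ; b.eq target
// and the ANDS prints as "tst w0, w1" once the result register ends up unused
// and is retargeted to wzr by a later pass.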
3772
3773/// \defgroup AArch64CCMP CMP;CCMP matching
3774///
3775/// These functions deal with the formation of CMP;CCMP;... sequences.
3776/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3777/// a comparison. They set the NZCV flags to a predefined value if their
3778/// predicate is false. This allows to express arbitrary conjunctions, for
3779/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3780/// expressed as:
3781/// cmp A
3782/// ccmp B, inv(CB), CA
3783/// check for CB flags
3784///
3785/// This naturally lets us implement chains of AND operations with SETCC
3786/// operands. And we can even implement some other situations by transforming
3787/// them:
3788/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3789/// negating the flags used in a CCMP/FCCMP operation.
3790/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3791/// by negating the flags we test for afterwards. i.e.
3792/// NEG (CMP CCMP CCCMP ...) can be implemented.
3793/// - Note that we can only ever negate all previously processed results.
3794/// What we can not implement by flipping the flags to test is a negation
3795/// of two sub-trees (because the negation affects all sub-trees emitted so
3796/// far, so the 2nd sub-tree we emit would also affect the first).
3797/// With those tools we can implement some OR operations:
3798/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3799/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3800/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3801/// elimination rules from earlier to implement the whole thing as a
3802/// CCMP/FCCMP chain.
3803///
3804/// As a complete example:
3805/// or (or (setCA (cmp A)) (setCB (cmp B)))
3806/// (and (setCC (cmp C)) (setCD (cmp D)))
3807/// can be reassociated to:
3808/// or (and (setCC (cmp C)) (setCD (cmp D)))
3809/// (or (setCA (cmp A)) (setCB (cmp B)))
3810/// can be transformed to:
3811/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3812/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3813/// which can be implemented as:
3814/// cmp C
3815/// ccmp D, inv(CD), CC
3816/// ccmp A, CA, inv(CD)
3817/// ccmp B, CB, inv(CA)
3818/// check for CB flags
3819///
3820/// A counterexample is "or (and A B) (and C D)" which translates to
3821/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3822/// can only implement 1 of the inner (not) operations, but not both!
3823/// @{
3824
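// A small concrete instance of the scheme described above: the conjunction
// (a == 0 && b == 7) can be emitted as roughly
//   cmp  w0, #0
//   ccmp w1, #7, #0, eq   ; if a == 0, compare b with 7; otherwise inject
//                         ; NZCV = #0 so the final 'eq' test fails
//   cset w2, eq
// and the disjunction (a == 0 || b == 7) differs only in the predicate and
// the injected flags:
//   ccmp w1, #7, #4, ne   ; NZCV = #4 sets Z, so 'eq' succeeds when a == 0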
3825/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3826static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3827 ISD::CondCode CC, SDValue CCOp,
3828 AArch64CC::CondCode Predicate,
3829 AArch64CC::CondCode OutCC,
3830 const SDLoc &DL, SelectionDAG &DAG) {
3831 unsigned Opcode = 0;
3832 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3833
3834 if (LHS.getValueType().isFloatingPoint()) {
3835 assert(LHS.getValueType() != MVT::f128);
3836 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3837 LHS.getValueType() == MVT::bf16) {
3838 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3839 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3840 }
3841 Opcode = AArch64ISD::FCCMP;
3842 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3843 APInt Imm = Const->getAPIntValue();
3844 if (Imm.isNegative() && Imm.sgt(-32)) {
3845 Opcode = AArch64ISD::CCMN;
3846 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3847 }
3848 } else if (isCMN(RHS, CC, DAG)) {
3849 Opcode = AArch64ISD::CCMN;
3850 RHS = RHS.getOperand(1);
3851 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3852 isIntEqualitySetCC(CC)) {
3853 // As we are looking for EQ/NE compares, the operands can be commuted; can
3854 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3855 Opcode = AArch64ISD::CCMN;
3856 LHS = LHS.getOperand(1);
3857 }
3858 if (Opcode == 0)
3859 Opcode = AArch64ISD::CCMP;
3860
3861 SDValue Condition = getCondCode(DAG, Predicate);
3862 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3863 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3864 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3865 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3866}
3867
3868/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3869/// expressed as a conjunction. See \ref AArch64CCMP.
3870/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3871/// changing the conditions on the SETCC tests.
3872/// (this means we can call emitConjunctionRec() with
3873/// Negate==true on this sub-tree)
3874/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3875/// cannot do the negation naturally. We are required to
3876/// emit the subtree first in this case.
3877/// \param WillNegate Is true if we are called when the result of this
3878/// subexpression must be negated. This happens when the
3879/// outer expression is an OR. We can use this fact to know
3880/// that we have a double negation (or (or ...) ...) that
3881/// can be implemented for free.
3882static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3883 bool &MustBeFirst, bool WillNegate,
3884 unsigned Depth = 0) {
3885 if (!Val.hasOneUse())
3886 return false;
3887 unsigned Opcode = Val->getOpcode();
3888 if (Opcode == ISD::SETCC) {
3889 if (Val->getOperand(0).getValueType() == MVT::f128)
3890 return false;
3891 CanNegate = true;
3892 MustBeFirst = false;
3893 return true;
3894 }
3895 // Protect against exponential runtime and stack overflow.
3896 if (Depth > 6)
3897 return false;
3898 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3899 bool IsOR = Opcode == ISD::OR;
3900 SDValue O0 = Val->getOperand(0);
3901 SDValue O1 = Val->getOperand(1);
3902 bool CanNegateL;
3903 bool MustBeFirstL;
3904 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3905 return false;
3906 bool CanNegateR;
3907 bool MustBeFirstR;
3908 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3909 return false;
3910
3911 if (MustBeFirstL && MustBeFirstR)
3912 return false;
3913
3914 if (IsOR) {
3915 // For an OR expression we need to be able to naturally negate at least
3916 // one side or we cannot do the transformation at all.
3917 if (!CanNegateL && !CanNegateR)
3918 return false;
3919 // If the result of the OR will be negated and we can naturally negate
3920 // the leaves, then this sub-tree as a whole negates naturally.
3921 CanNegate = WillNegate && CanNegateL && CanNegateR;
3922 // If we cannot naturally negate the whole sub-tree, then this must be
3923 // emitted first.
3924 MustBeFirst = !CanNegate;
3925 } else {
3926 assert(Opcode == ISD::AND && "Must be OR or AND");
3927 // We cannot naturally negate an AND operation.
3928 CanNegate = false;
3929 MustBeFirst = MustBeFirstL || MustBeFirstR;
3930 }
3931 return true;
3932 }
3933 return false;
3934}
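// For illustration, tracing the rules above on a few shapes:
//  - (and (setcc A) (setcc B)): both leaves are negatable, so the AND is
//    emittable, with CanNegate = false.
//  - (or (setcc A) (setcc B)): emittable; it is only naturally negatable when
//    the caller will negate it anyway (WillNegate).
//  - (or (and ...) (and ...)): rejected, since neither AND side can be
//    negated, matching the counterexample in the comment block above.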
3935
3936/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3937/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3938/// Tries to transform the given i1 producing node @p Val to a series of
3939/// compare and conditional compare operations. @returns an NZCV flags
3940/// producing node and sets @p OutCC to the flags that should be tested, or
3941/// returns SDValue() if the transformation was not possible.
3942/// \p Negate is true if we want this sub-tree to be negated just by changing
3943/// SETCC conditions.
3944static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3945 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3946 AArch64CC::CondCode Predicate) {
3947 // We're at a tree leaf, produce a conditional comparison operation.
3948 unsigned Opcode = Val->getOpcode();
3949 if (Opcode == ISD::SETCC) {
3950 SDValue LHS = Val->getOperand(0);
3951 SDValue RHS = Val->getOperand(1);
3952 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3953 bool isInteger = LHS.getValueType().isInteger();
3954 if (Negate)
3955 CC = getSetCCInverse(CC, LHS.getValueType());
3956 SDLoc DL(Val);
3957 // Determine OutCC and handle FP special case.
3958 if (isInteger) {
3959 OutCC = changeIntCCToAArch64CC(CC, RHS);
3960 } else {
3961 assert(LHS.getValueType().isFloatingPoint());
3962 AArch64CC::CondCode ExtraCC;
3963 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3964 // Some floating point conditions can't be tested with a single condition
3965 // code. Construct an additional comparison in this case.
3966 if (ExtraCC != AArch64CC::AL) {
3967 SDValue ExtraCmp;
3968 if (!CCOp.getNode())
3969 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3970 else
3971 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3972 ExtraCC, DL, DAG);
3973 CCOp = ExtraCmp;
3974 Predicate = ExtraCC;
3975 }
3976 }
3977
3978 // Produce a normal comparison if we are first in the chain
3979 if (!CCOp)
3980 return emitComparison(LHS, RHS, CC, DL, DAG);
3981 // Otherwise produce a ccmp.
3982 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3983 DAG);
3984 }
3985 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3986
3987 bool IsOR = Opcode == ISD::OR;
3988
3989 SDValue LHS = Val->getOperand(0);
3990 bool CanNegateL;
3991 bool MustBeFirstL;
3992 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3993 assert(ValidL && "Valid conjunction/disjunction tree");
3994 (void)ValidL;
3995
3996 SDValue RHS = Val->getOperand(1);
3997 bool CanNegateR;
3998 bool MustBeFirstR;
3999 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
4000 assert(ValidR && "Valid conjunction/disjunction tree");
4001 (void)ValidR;
4002
4003 // Swap sub-tree that must come first to the right side.
4004 if (MustBeFirstL) {
4005 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4006 std::swap(LHS, RHS);
4007 std::swap(CanNegateL, CanNegateR);
4008 std::swap(MustBeFirstL, MustBeFirstR);
4009 }
4010
4011 bool NegateR;
4012 bool NegateAfterR;
4013 bool NegateL;
4014 bool NegateAfterAll;
4015 if (Opcode == ISD::OR) {
4016 // Swap the sub-tree that we can negate naturally to the left.
4017 if (!CanNegateL) {
4018 assert(CanNegateR && "at least one side must be negatable");
4019 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4020 assert(!Negate);
4021 std::swap(LHS, RHS);
4022 NegateR = false;
4023 NegateAfterR = true;
4024 } else {
4025 // Negate the left sub-tree if possible, otherwise negate the result.
4026 NegateR = CanNegateR;
4027 NegateAfterR = !CanNegateR;
4028 }
4029 NegateL = true;
4030 NegateAfterAll = !Negate;
4031 } else {
4032 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4033 assert(!Negate && "Valid conjunction/disjunction tree");
4034
4035 NegateL = false;
4036 NegateR = false;
4037 NegateAfterR = false;
4038 NegateAfterAll = false;
4039 }
4040
4041 // Emit sub-trees.
4042 AArch64CC::CondCode RHSCC;
4043 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4044 if (NegateAfterR)
4045 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4046 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4047 if (NegateAfterAll)
4048 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4049 return CmpL;
4050}
4051
4052/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
4053/// In some cases this is even possible with OR operations in the expression.
4054/// See \ref AArch64CCMP.
4055/// \see emitConjunctionRec().
4056static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
4057 AArch64CC::CondCode &OutCC) {
4058 bool DummyCanNegate;
4059 bool DummyMustBeFirst;
4060 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
4061 return SDValue();
4062
4063 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4064}
4065
4066/// @}
4067
4068/// Returns how profitable it is to fold a comparison's operand's shift and/or
4069/// extension operations.
4070static unsigned getCmpOperandFoldingProfit(SDValue Op) {
4071 auto isSupportedExtend = [&](SDValue V) {
4072 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4073 return true;
4074
4075 if (V.getOpcode() == ISD::AND)
4076 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4077 uint64_t Mask = MaskCst->getZExtValue();
4078 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4079 }
4080
4081 return false;
4082 };
4083
4084 if (!Op.hasOneUse())
4085 return 0;
4086
4087 if (isSupportedExtend(Op))
4088 return 1;
4089
4090 unsigned Opc = Op.getOpcode();
4091 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4092 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4093 uint64_t Shift = ShiftCst->getZExtValue();
4094 if (isSupportedExtend(Op.getOperand(0)))
4095 return (Shift <= 4) ? 2 : 1;
4096 EVT VT = Op.getValueType();
4097 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4098 return 1;
4099 }
4100
4101 return 0;
4102}
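// For example, an operand like (shl (and x, 0xFF), 2) scores 2 above: the
// masked value folds as an extended register and the small shift folds on top
// of it (roughly "cmp w1, w0, uxtb #2"), whereas a plain add scores 0, so
// getAArch64Cmp() below may swap operands to keep the foldable one on the RHS
// of the compare.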
4103
4104// emitComparison() converts a comparison with one or negative one into a
4105// comparison with 0. Note that this only works for signed comparisons because
4106// of how ANDS works.
4107static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC) {
4108 // Only works for ANDS and AND.
4109 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4110 return false;
4111
4112 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4113 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4114 return true;
4115 }
4116
4117 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4118 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4119 return true;
4120 }
4121
4122 return false;
4123}
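// For example, a signed "(x & y) < 1" is rewritten above as "(x & y) <= 0",
// with the constant forced to zero by the caller, which then matches the
// ANDS/TST-against-zero path in emitComparison().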
4124
4125static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4126 SDValue &AArch64cc, SelectionDAG &DAG,
4127 const SDLoc &DL) {
4128 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4129 EVT VT = RHS.getValueType();
4130 APInt C = RHSC->getAPIntValue();
4131 // shouldBeAdjustedToZero is a special case to better fold with
4132 // emitComparison().
4133 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4134 // Adjust the constant to zero.
4135 // CC has already been adjusted.
4136 RHS = DAG.getConstant(0, DL, VT);
4137 } else if (!isLegalCmpImmed(C)) {
4138 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4139 // Constant does not fit, try adjusting it by one?
4140 switch (CC) {
4141 default:
4142 break;
4143 case ISD::SETLT:
4144 case ISD::SETGE:
4145 if (!C.isMinSignedValue()) {
4146 APInt CMinusOne = C - 1;
4147 if (isLegalCmpImmed(CMinusOne) ||
4148 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4149 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4150 RHS = DAG.getConstant(CMinusOne, DL, VT);
4151 }
4152 }
4153 break;
4154 case ISD::SETULT:
4155 case ISD::SETUGE: {
4156 // C cannot be 0 here: 0 is a legal immediate, and we only get here when C
4156 // is not legal.
4157 assert(!C.isZero() && "C should not be zero here");
4158 APInt CMinusOne = C - 1;
4159 if (isLegalCmpImmed(CMinusOne) ||
4160 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4161 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4162 RHS = DAG.getConstant(CMinusOne, DL, VT);
4163 }
4164 break;
4165 }
4166 case ISD::SETLE:
4167 case ISD::SETGT:
4168 if (!C.isMaxSignedValue()) {
4169 APInt CPlusOne = C + 1;
4170 if (isLegalCmpImmed(CPlusOne) ||
4171 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4172 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4173 RHS = DAG.getConstant(CPlusOne, DL, VT);
4174 }
4175 }
4176 break;
4177 case ISD::SETULE:
4178 case ISD::SETUGT: {
4179 if (!C.isAllOnes()) {
4180 APInt CPlusOne = C + 1;
4181 if (isLegalCmpImmed(CPlusOne) ||
4182 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4183 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4184 RHS = DAG.getConstant(CPlusOne, DL, VT);
4185 }
4186 }
4187 break;
4188 }
4189 }
4190 }
4191 }
4192
4193 // Comparisons are canonicalized so that the RHS operand is simpler than the
4194 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4195 // can fold some shift+extend operations on the RHS operand, so swap the
4196 // operands if that can be done.
4197 //
4198 // For example:
4199 // lsl w13, w11, #1
4200 // cmp w13, w12
4201 // can be turned into:
4202 // cmp w12, w11, lsl #1
4203 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4204 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4205 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4206 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4207 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4208
4209 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4210 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4211 std::swap(LHS, RHS);
4212 CC = ISD::getSetCCSwappedOperands(CC);
4213 }
4214 }
4215
4216 SDValue Cmp;
4217 AArch64CC::CondCode AArch64CC;
4218 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4219 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4220
4221 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4222 // For the i8 operand, the largest immediate is 255, so this can be easily
4223 // encoded in the compare instruction. For the i16 operand, however, the
4224 // largest immediate cannot be encoded in the compare.
4225 // Therefore, use a sign extending load and cmn to avoid materializing the
4226 // -1 constant. For example,
4227 // movz w1, #65535
4228 // ldrh w0, [x0, #0]
4229 // cmp w0, w1
4230 // >
4231 // ldrsh w0, [x0, #0]
4232 // cmn w0, #1
4233 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4234 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4235 // ensure both the LHS and RHS are truly zero extended and to make sure the
4236 // transformation is profitable.
4237 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4238 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4239 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4240 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4241 int16_t ValueofRHS = RHS->getAsZExtVal();
4242 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4243 SDValue SExt =
4244 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4245 DAG.getValueType(MVT::i16));
4246 Cmp = emitComparison(
4247 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4248 DL, DAG);
4250 }
4251 }
4252
4253 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4254 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4255 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4256 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4257 }
4258 }
4259 }
4260
4261 if (!Cmp) {
4262 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4264 }
4265 AArch64cc = getCondCode(DAG, AArch64CC);
4266 return Cmp;
4267}
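// As an example of the constant adjustment above: a signed "x < 0x1001" cannot
// be encoded directly (the immediate needs bits in both 12-bit halves), but the
// equivalent "x <= 0x1000" can, giving roughly "cmp x0, #0x1000 ; b.le" instead
// of materializing 0x1001 into a register first.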
4268
4269static std::pair<SDValue, SDValue>
4270getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4271 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4272 "Unsupported value type");
4273 SDValue Value, Overflow;
4274 SDLoc DL(Op);
4275 SDValue LHS = Op.getOperand(0);
4276 SDValue RHS = Op.getOperand(1);
4277 unsigned Opc = 0;
4278 switch (Op.getOpcode()) {
4279 default:
4280 llvm_unreachable("Unknown overflow instruction!");
4281 case ISD::SADDO:
4282 Opc = AArch64ISD::ADDS;
4283 CC = AArch64CC::VS;
4284 break;
4285 case ISD::UADDO:
4286 Opc = AArch64ISD::ADDS;
4287 CC = AArch64CC::HS;
4288 break;
4289 case ISD::SSUBO:
4290 Opc = AArch64ISD::SUBS;
4291 CC = AArch64CC::VS;
4292 break;
4293 case ISD::USUBO:
4294 Opc = AArch64ISD::SUBS;
4295 CC = AArch64CC::LO;
4296 break;
4297 // Multiply needs a little bit of extra work.
4298 case ISD::SMULO:
4299 case ISD::UMULO: {
4300 CC = AArch64CC::NE;
4301 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4302 if (Op.getValueType() == MVT::i32) {
4303 // Extend to 64-bits, then perform a 64-bit multiply.
4304 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4305 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4306 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4307 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4308 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4309
4310 // Check that the result fits into a 32-bit integer.
4311 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4312 if (IsSigned) {
4313 // cmp xreg, wreg, sxtw
4314 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4315 Overflow =
4316 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4317 } else {
4318 // tst xreg, #0xffffffff00000000
4319 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4320 Overflow =
4321 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4322 }
4323 break;
4324 }
4325 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4326 // For the 64 bit multiply
4327 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4328 if (IsSigned) {
4329 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4330 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4331 DAG.getConstant(63, DL, MVT::i64));
4332 // It is important that LowerBits is last, otherwise the arithmetic
4333 // shift will not be folded into the compare (SUBS).
4334 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4335 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4336 .getValue(1);
4337 } else {
4338 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4339 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4340 Overflow =
4341 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4342 DAG.getConstant(0, DL, MVT::i64),
4343 UpperBits).getValue(1);
4344 }
4345 break;
4346 }
4347 } // switch (...)
4348
4349 if (Opc) {
4350 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4351
4352 // Emit the AArch64 operation with overflow check.
4353 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4354 Overflow = Value.getValue(1);
4355 }
4356 return std::make_pair(Value, Overflow);
4357}
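// For the i32 smul.with.overflow case above, the emitted sequence is roughly
//   smull x8, w0, w1        ; 64-bit product of the 32-bit inputs
//   cmp   x8, w8, sxtw      ; does the product survive truncation to 32 bits?
//   cset  w9, ne            ; overflow bit
// i.e. the overflow test is "full product != sign-extension of its low half".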
4358
4359SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4360 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4361 !Subtarget->isNeonAvailable()))
4362 return LowerToScalableOp(Op, DAG);
4363
4364 SDValue Sel = Op.getOperand(0);
4365 SDValue Other = Op.getOperand(1);
4366 SDLoc DL(Sel);
4367
4368 // If the operand is an overflow checking operation, invert the condition
4369 // code and kill the Not operation. I.e., transform:
4370 // (xor (overflow_op_bool, 1))
4371 // -->
4372 // (csel 1, 0, invert(cc), overflow_op_bool)
4373 // ... which later gets transformed to just a cset instruction with an
4374 // inverted condition code, rather than a cset + eor sequence.
4375 if (isOverflowIntrOpRes(Sel)) {
4376 // Only lower legal XALUO ops.
4377 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4378 return SDValue();
4379
4380 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4381 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4382 AArch64CC::CondCode CC;
4383 SDValue Value, Overflow;
4384 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4385 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4386 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4387 CCVal, Overflow);
4388 }
4389 // If neither operand is a SELECT_CC, give up.
4390 if (Sel.getOpcode() != ISD::SELECT_CC)
4391 std::swap(Sel, Other);
4392 if (Sel.getOpcode() != ISD::SELECT_CC)
4393 return Op;
4394
4395 // The folding we want to perform is:
4396 // (xor x, (select_cc a, b, cc, 0, -1) )
4397 // -->
4398 // (csel x, (xor x, -1), cc ...)
4399 //
4400 // The latter will get matched to a CSINV instruction.
4401
4402 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4403 SDValue LHS = Sel.getOperand(0);
4404 SDValue RHS = Sel.getOperand(1);
4405 SDValue TVal = Sel.getOperand(2);
4406 SDValue FVal = Sel.getOperand(3);
4407
4408 // FIXME: This could be generalized to non-integer comparisons.
4409 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4410 return Op;
4411
4412 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4413 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4414
4415 // The values aren't constants, this isn't the pattern we're looking for.
4416 if (!CFVal || !CTVal)
4417 return Op;
4418
4419 // We can commute the SELECT_CC by inverting the condition. This
4420 // might be needed to make this fit into a CSINV pattern.
4421 if (CTVal->isAllOnes() && CFVal->isZero()) {
4422 std::swap(TVal, FVal);
4423 std::swap(CTVal, CFVal);
4424 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4425 }
4426
4427 // If the constants line up, perform the transform!
4428 if (CTVal->isZero() && CFVal->isAllOnes()) {
4429 SDValue CCVal;
4430 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4431
4432 FVal = Other;
4433 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4434 DAG.getAllOnesConstant(DL, Other.getValueType()));
4435
4436 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4437 CCVal, Cmp);
4438 }
4439
4440 return Op;
4441}
4442
4443// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4444// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4445// sets 'C' bit to 0.
4446static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4447 SDLoc DL(Value);
4448 EVT VT = Value.getValueType();
4449 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4450 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4451 SDValue Cmp =
4452 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4453 return Cmp.getValue(1);
4454}
4455
4456// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4457// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4458static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4459 bool Invert) {
4460 assert(Glue.getResNo() == 1);
4461 SDLoc DL(Glue);
4462 SDValue Zero = DAG.getConstant(0, DL, VT);
4463 SDValue One = DAG.getConstant(1, DL, VT);
4464 AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4465 SDValue CC = getCondCode(DAG, Cond);
4466 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4467}
4468
4469// Value is 1 if 'V' bit of NZCV is 1, else 0
4470static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4471 assert(Glue.getResNo() == 1);
4472 SDLoc DL(Glue);
4473 SDValue Zero = DAG.getConstant(0, DL, VT);
4474 SDValue One = DAG.getConstant(1, DL, VT);
4475 SDValue CC = getCondCode(DAG, AArch64CC::VS);
4476 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4477}
4478
4479// This lowering is inefficient, but it will get cleaned up by
4480// `foldOverflowCheck`
4481static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4482 unsigned Opcode, bool IsSigned) {
4483 EVT VT0 = Op.getValue(0).getValueType();
4484 EVT VT1 = Op.getValue(1).getValueType();
4485
4486 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4487 return SDValue();
4488
4489 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4490 SDValue OpLHS = Op.getOperand(0);
4491 SDValue OpRHS = Op.getOperand(1);
4492 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4493
4494 SDLoc DL(Op);
4495
4496 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4497 OpRHS, OpCarryIn);
4498
4499 SDValue OutFlag =
4500 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4501 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4502
4503 return DAG.getMergeValues({Sum, OutFlag}, DL);
4504}
4505
4506static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4507 // Let legalize expand this if it isn't a legal type yet.
4508 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4509 return SDValue();
4510
4511 SDLoc DL(Op);
4512 AArch64CC::CondCode CC;
4513 // The actual operation that sets the overflow or carry flag.
4514 SDValue Value, Overflow;
4515 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4516
4517 // We use 0 and 1 as false and true values.
4518 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4519 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4520
4521 // We use an inverted condition, because the conditional select is inverted
4522 // too. This will allow it to be selected to a single instruction:
4523 // CSINC Wd, WZR, WZR, invert(cond).
4524 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4525 Overflow =
4526 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4527
4528 return DAG.getMergeValues({Value, Overflow}, DL);
4529}
4530
4531// Prefetch operands are:
4532// 1: Address to prefetch
4533// 2: bool isWrite
4534// 3: int locality (0 = no locality ... 3 = extreme locality)
4535// 4: bool isDataCache
4536static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4537 SDLoc DL(Op);
4538 unsigned IsWrite = Op.getConstantOperandVal(2);
4539 unsigned Locality = Op.getConstantOperandVal(3);
4540 unsigned IsData = Op.getConstantOperandVal(4);
4541
4542 bool IsStream = !Locality;
4543 // When the locality number is set
4544 if (Locality) {
4545 // The front-end should have filtered out the out-of-range values
4546 assert(Locality <= 3 && "Prefetch locality out-of-range");
4547 // The locality degree is the opposite of the cache speed.
4548 // Put the number the other way around.
4549 // The encoding starts at 0 for level 1
4550 Locality = 3 - Locality;
4551 }
4552
4553 // Build the mask value encoding the expected behavior.
4554 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4555 (!IsData << 3) | // IsDataCache bit
4556 (Locality << 1) | // Cache level bits
4557 (unsigned)IsStream; // Stream bit
4558 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4559 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4560 Op.getOperand(1));
4561}
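// Worked example of the encoding above: a read prefetch with locality 3 of the
// data cache gives IsWrite=0, IsData=1, IsStream=0 and Locality remapped to 0,
// so PrfOp = 0b00000, i.e. PLDL1KEEP; the same hint for a write sets bit 4 and
// becomes PSTL1KEEP (0b10000), while locality 0 sets the stream bit instead.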
4562
4563// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4564// is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4565// SUBS (AND X Y) Z, which produces a better result with emitComparison().
4567 SelectionDAG &DAG, const SDLoc DL) {
4568 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4569 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4570 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4571 if (LHSConstOp && RHSConst) {
4572 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4573 uint64_t RHSConstant = RHSConst->getZExtValue();
4574 if (isPowerOf2_64(RHSConstant)) {
4575 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4576 LHS =
4577 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4578 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4579 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4580 CC = ISD::SETEQ;
4581 }
4582 }
4583 }
4584}
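// For example, "(x & 0xFF) u< 16" becomes "(x & 0xF0) == 0" here (16 is a
// power of two, so the masked-off low bits cannot affect the result), which
// then lowers to roughly "tst w0, #0xf0 ; b.eq" rather than an AND followed by
// a SUBS.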
4585
4586SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4587 SelectionDAG &DAG) const {
4588 EVT VT = Op.getValueType();
4589 if (VT.isScalableVector()) {
4590 SDValue SrcVal = Op.getOperand(0);
4591
4592 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4593 // Break conversion in two with the first part converting to f32 and the
4594 // second using native f32->VT instructions.
4595 SDLoc DL(Op);
4596 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4597 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4598 }
4599
4600 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4601 }
4602
4603 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4604 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4605
4606 bool IsStrict = Op->isStrictFPOpcode();
4607 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4608 EVT Op0VT = Op0.getValueType();
4609 if (VT == MVT::f64) {
4610 // f32->f64 and f16->f64 extends are legal.
4611 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4612 return Op;
4613 // Split bf16->f64 extends into two fpextends.
4614 if (Op0VT == MVT::bf16 && IsStrict) {
4615 SDValue Ext1 =
4616 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4617 {Op0, Op.getOperand(0)});
4618 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4619 {Ext1, Ext1.getValue(1)});
4620 }
4621 if (Op0VT == MVT::bf16)
4622 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4623 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4624 return SDValue();
4625 }
4626
4627 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4628 return SDValue();
4629}
4630
4631SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4632 SelectionDAG &DAG) const {
4633 EVT VT = Op.getValueType();
4634 bool IsStrict = Op->isStrictFPOpcode();
4635 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4636 EVT SrcVT = SrcVal.getValueType();
4637 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4638
4639 if (VT.isScalableVector()) {
4640 // Let common code split the operation.
4641 if (SrcVT == MVT::nxv8f32)
4642 return Op;
4643
4644 if (VT.getScalarType() != MVT::bf16)
4645 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4646
4647 SDLoc DL(Op);
4648 constexpr EVT I32 = MVT::nxv4i32;
4649 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4650
4651 SDValue NaN;
4652 SDValue Narrow;
4653
4654 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4655 if (Subtarget->hasBF16())
4656 return LowerToPredicatedOp(Op, DAG,
4657 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4658
4659 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4660
4661 // Set the quiet bit.
4662 if (!DAG.isKnownNeverSNaN(SrcVal))
4663 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4664 } else if (SrcVT == MVT::nxv2f64 &&
4665 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4666 // Round to float without introducing rounding errors and try again.
4667 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4668 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4669 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4670
4671 SmallVector<SDValue, 3> NewOps;
4672 if (IsStrict)
4673 NewOps.push_back(Op.getOperand(0));
4674 NewOps.push_back(Narrow);
4675 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4676 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4677 } else
4678 return SDValue();
4679
4680 if (!Trunc) {
4681 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4682 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4683 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4684 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4685 }
4686
4687 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4688 // 0x80000000.
4689 if (NaN) {
4690 EVT I1 = I32.changeElementType(MVT::i1);
4691 EVT CondVT = VT.changeElementType(MVT::i1);
4692 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4693 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4694 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4695 }
4696
4697 // Now that we have rounded, shift the bits into position.
4698 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4699 return getSVESafeBitCast(VT, Narrow, DAG);
4700 }
4701
4702 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4703 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4704
4705 // Expand cases where the result type is BF16 but we don't have hardware
4706 // instructions to lower it.
4707 if (VT.getScalarType() == MVT::bf16 &&
4708 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4709 Subtarget->hasBF16())) {
4710 SDLoc DL(Op);
4711 SDValue Narrow = SrcVal;
4712 SDValue NaN;
4713 EVT I32 = SrcVT.changeElementType(MVT::i32);
4714 EVT F32 = SrcVT.changeElementType(MVT::f32);
4715 if (SrcVT.getScalarType() == MVT::f32) {
4716 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4717 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4718 if (!NeverSNaN) {
4719 // Set the quiet bit.
4720 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4721 DAG.getConstant(0x400000, DL, I32));
4722 }
4723 } else if (SrcVT.getScalarType() == MVT::f64) {
4724 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4725 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4726 } else {
4727 return SDValue();
4728 }
4729 if (!Trunc) {
4730 SDValue One = DAG.getConstant(1, DL, I32);
4731 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4732 DAG.getShiftAmountConstant(16, I32, DL));
4733 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4734 SDValue RoundingBias =
4735 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4736 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4737 }
4738
4739 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4740 // 0x80000000.
4741 if (NaN) {
4742 SDValue IsNaN = DAG.getSetCC(
4743 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4744 SrcVal, SrcVal, ISD::SETUO);
4745 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4746 }
4747
4748 // Now that we have rounded, shift the bits into position.
4749 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4750 DAG.getShiftAmountConstant(16, I32, DL));
4751 if (VT.isVector()) {
4752 EVT I16 = I32.changeVectorElementType(MVT::i16);
4753 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4754 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4755 }
4756 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4757 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4758 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4759 : Result;
4760 }
4761
4762 if (SrcVT != MVT::f128) {
4763 // Expand cases where the input is a vector bigger than NEON.
4765 return SDValue();
4766
4767 // It's legal except when f128 is involved
4768 return Op;
4769 }
4770
4771 return SDValue();
4772}
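// The Lsb + 0x7fff bias used above is the usual round-to-nearest-even trick
// for truncating f32 bits to bf16: for instance, f32 bits 0x3F808000 sit
// exactly halfway between two bf16 values and stay at 0x3F80 (the even
// neighbour), while 0x3F818000 rounds up to 0x3F82.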
4773
4774SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4775 SelectionDAG &DAG) const {
4776 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4777 // Any additional optimization in this function should be recorded
4778 // in the cost tables.
4779 bool IsStrict = Op->isStrictFPOpcode();
4780 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4781 EVT VT = Op.getValueType();
4782
4783 assert(!(IsStrict && VT.isScalableVector()) &&
4784 "Unimplemented SVE support for STRICT_FP_to_INT!");
4785
4786 // f16 conversions are promoted to f32 when full fp16 is not supported.
4787 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4788 InVT.getVectorElementType() == MVT::bf16) {
4789 EVT NewVT = VT.changeElementType(MVT::f32);
4790 SDLoc DL(Op);
4791 if (IsStrict) {
4792 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4793 {Op.getOperand(0), Op.getOperand(1)});
4794 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4795 {Ext.getValue(1), Ext.getValue(0)});
4796 }
4797 return DAG.getNode(
4798 Op.getOpcode(), DL, Op.getValueType(),
4799 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4800 }
4801
4802 if (VT.isScalableVector()) {
4803 if (VT.getVectorElementType() == MVT::i1) {
4804 SDLoc DL(Op);
4805 EVT CvtVT = getPromotedVTForPredicate(VT);
4806 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4807 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4808 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4809 }
4810
4811 // Let common code split the operation.
4812 if (InVT == MVT::nxv8f32)
4813 return Op;
4814
4815 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4816 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4817 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4818 return LowerToPredicatedOp(Op, DAG, Opcode);
4819 }
4820
4821 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4822 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4823 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4824
4825 uint64_t VTSize = VT.getFixedSizeInBits();
4826 uint64_t InVTSize = InVT.getFixedSizeInBits();
4827 if (VTSize < InVTSize) {
4828 SDLoc DL(Op);
4829 if (IsStrict) {
4830 InVT = InVT.changeVectorElementTypeToInteger();
4831 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4832 {Op.getOperand(0), Op.getOperand(1)});
4833 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4834 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4835 }
4836 SDValue Cv =
4837 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4838 Op.getOperand(0));
4839 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4840 }
4841
4842 if (VTSize > InVTSize) {
4843 SDLoc DL(Op);
4844 MVT ExtVT =
4845 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4846 VT.getVectorNumElements());
4847 if (IsStrict) {
4848 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4849 {Op.getOperand(0), Op.getOperand(1)});
4850 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4851 {Ext.getValue(1), Ext.getValue(0)});
4852 }
4853 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4854 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4855 }
4856
4857 // Use a scalar operation for conversions between single-element vectors of
4858 // the same size.
4859 if (InVT.getVectorNumElements() == 1) {
4860 SDLoc DL(Op);
4861 SDValue Extract = DAG.getNode(
4862 ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(),
4863 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4864 EVT ScalarVT = VT.getScalarType();
4865 if (IsStrict)
4866 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4867 {Op.getOperand(0), Extract});
4868 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4869 }
4870
4871 // Type changing conversions are illegal.
4872 return Op;
4873}
4874
4875SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4876 SelectionDAG &DAG) const {
4877 bool IsStrict = Op->isStrictFPOpcode();
4878 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4879
4880 if (SrcVal.getValueType().isVector())
4881 return LowerVectorFP_TO_INT(Op, DAG);
4882
4883 // f16 conversions are promoted to f32 when full fp16 is not supported.
4884 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4885 SrcVal.getValueType() == MVT::bf16) {
4886 SDLoc DL(Op);
4887 if (IsStrict) {
4888 SDValue Ext =
4889 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4890 {Op.getOperand(0), SrcVal});
4891 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4892 {Ext.getValue(1), Ext.getValue(0)});
4893 }
4894 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4895 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4896 }
4897
4898 if (SrcVal.getValueType() != MVT::f128) {
4899 // It's legal except when f128 is involved
4900 return Op;
4901 }
4902
4903 return SDValue();
4904}
4905
4906SDValue
4907AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4908 SelectionDAG &DAG) const {
4909 // AArch64 FP-to-int conversions saturate to the destination element size, so
4910 // we can lower common saturating conversions to simple instructions.
4911 SDValue SrcVal = Op.getOperand(0);
4912 EVT SrcVT = SrcVal.getValueType();
4913 EVT DstVT = Op.getValueType();
4914 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4915
4916 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4917 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4918 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4919 assert(SatWidth <= DstElementWidth &&
4920 "Saturation width cannot exceed result width");
4921
4922 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4923 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4924 // types, so this is hard to reach.
4925 if (DstVT.isScalableVector())
4926 return SDValue();
4927
4928 EVT SrcElementVT = SrcVT.getVectorElementType();
4929
4930 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4931 SDLoc DL(Op);
4932 SDValue SrcVal2;
4933 if ((SrcElementVT == MVT::f16 &&
4934 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4935 SrcElementVT == MVT::bf16) {
4936 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4937 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4938 // If we are extending to a v8f32, split into two v4f32 to produce legal
4939 // types.
4940 if (F32VT.getSizeInBits() > 128) {
4941 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4942 F32VT = F32VT.getHalfNumVectorElementsVT();
4943 }
4944 SrcVT = F32VT;
4945 SrcElementVT = MVT::f32;
4946 SrcElementWidth = 32;
4947 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4948 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4949 return SDValue();
4950
4951 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4952 // width and produce a fcvtzu.
4953 if (SatWidth == 64 && SrcElementWidth < 64) {
4954 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4955 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4956 SrcVT = F64VT;
4957 SrcElementVT = MVT::f64;
4958 SrcElementWidth = 64;
4959 }
4960 // Cases that we can emit directly.
4961 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4962 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4963 DAG.getValueType(DstVT.getScalarType()));
4964 if (SrcVal2) {
4965 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4966 DAG.getValueType(DstVT.getScalarType()));
4967 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4968 }
4969 return Res;
4970 }
4971
4972 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4973 // result. This is only valid if the legal cvt is larger than the saturate
4974 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4975 // (at least until sqxtn is selected).
4976 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4977 return SDValue();
4978
4979 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4980 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4981 DAG.getValueType(IntVT.getScalarType()));
4982 SDValue NativeCvt2 =
4983 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4984 DAG.getValueType(IntVT.getScalarType()))
4985 : SDValue();
4986 SDValue Sat, Sat2;
4987 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4988 SDValue MinC = DAG.getConstant(
4989 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4990 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4991 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4992 SDValue MaxC = DAG.getConstant(
4993 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4994 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4995 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4996 } else {
4997 SDValue MinC = DAG.getConstant(
4998 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4999 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
5000 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5001 }
5002
5003 if (SrcVal2)
5004 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
5006 Sat, Sat2);
5007
5008 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5009}
5010
5011SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
5012 SelectionDAG &DAG) const {
5013 // AArch64 FP-to-int conversions saturate to the destination register size, so
5014 // we can lower common saturating conversions to simple instructions.
5015 SDValue SrcVal = Op.getOperand(0);
5016 EVT SrcVT = SrcVal.getValueType();
5017
5018 if (SrcVT.isVector())
5019 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5020
5021 EVT DstVT = Op.getValueType();
5022 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5023 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5024 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5025 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5026
5027 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5028 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5029 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5030 SrcVT = MVT::f32;
5031 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5032 SrcVT != MVT::bf16)
5033 return SDValue();
5034
5035 SDLoc DL(Op);
5036 // Cases that we can emit directly.
5037 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5038 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5039 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5040 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5041 DAG.getValueType(DstVT));
5042
5043 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5044 // result. This is only valid if the legal cvt is larger than the saturate
5045 // width.
5046 if (DstWidth < SatWidth)
5047 return SDValue();
5048
5049 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5050 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5051 SDValue CVTf32 =
5052 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5053 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5054 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5055 DAG.getValueType(SatVT));
5056 }
5057 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5058 return DAG.getBitcast(DstVT, CVTf32);
5059 }
5060
5061 SDValue NativeCvt =
5062 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5063 SDValue Sat;
5064 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5065 SDValue MinC = DAG.getConstant(
5066 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5067 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5068 SDValue MaxC = DAG.getConstant(
5069 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5070 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5071 } else {
5072 SDValue MinC = DAG.getConstant(
5073 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5074 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5075 }
5076
5077 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5078}
5079
5080SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5081 SelectionDAG &DAG) const {
5082 EVT VT = Op.getValueType();
5083 SDValue Src = Op.getOperand(0);
5084 SDLoc DL(Op);
5085
5086 assert(VT.isVector() && "Expected vector type");
5087
5088 EVT CastVT =
5089 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
5090
5091 // Round the floating-point value into a floating-point register with the
5092 // current rounding mode.
5093 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5094
5095 // Truncate the rounded floating point to an integer.
5096 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5097 DAG.getValueType(VT.getVectorElementType()));
5098}
5099
5100SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5101 SelectionDAG &DAG) const {
5102 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5103 // Any additional optimization in this function should be recorded
5104 // in the cost tables.
5105 bool IsStrict = Op->isStrictFPOpcode();
5106 EVT VT = Op.getValueType();
5107 SDLoc DL(Op);
5108 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5109 EVT InVT = In.getValueType();
5110 unsigned Opc = Op.getOpcode();
5111 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5112
5113 assert(!(IsStrict && VT.isScalableVector()) &&
5114 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
5115
5116 // NOTE: i1->bf16 does not require promotion to f32.
5117 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5118 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5119 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5120 : DAG.getConstantFP(1.0, DL, VT);
5121 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5122 }
5123
5124 // Promote bf16 conversions to f32.
5125 if (VT.getVectorElementType() == MVT::bf16) {
5126 EVT F32 = VT.changeElementType(MVT::f32);
5127 if (IsStrict) {
5128 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5129 {Op.getOperand(0), In});
5130 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5131 {Op.getValueType(), MVT::Other},
5132 {Val.getValue(1), Val.getValue(0),
5133 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5134 }
5135 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5136 DAG.getNode(Op.getOpcode(), DL, F32, In),
5137 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5138 }
5139
5140 if (VT.isScalableVector()) {
5141 // Let common code split the operation.
5142 if (VT == MVT::nxv8f32)
5143 return Op;
5144
5145 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5146 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5147 return LowerToPredicatedOp(Op, DAG, Opcode);
5148 }
5149
5150 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5151 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5152 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5153
5154 uint64_t VTSize = VT.getFixedSizeInBits();
5155 uint64_t InVTSize = InVT.getFixedSizeInBits();
5156 if (VTSize < InVTSize) {
5157 // AArch64 doesn't have a direct vector instruction to convert
5158 // fixed point to floating point AND narrow it at the same time.
5159 // Additional rounding when the target is f32/f64 causes double
5160 // rounding issues. Conversion to f16 is fine due to narrow width.
5161 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5162 bool IsTargetf16 = false;
5163 if (Op.hasOneUse() &&
5164 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5165 // Some vector types are split during legalization into half, followed by
5166 // concatenation, followed by rounding to the original vector type. If we
5167 // end up resolving to f16 type, we shouldn't worry about rounding errors.
5168 SDNode *U = *Op->user_begin();
5169 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5170 EVT TmpVT = U->user_begin()->getValueType(0);
5171 if (TmpVT.getScalarType() == MVT::f16)
5172 IsTargetf16 = true;
5173 }
5174 }
5175
5176 if (IsTargetf32 && !IsTargetf16) {
5177 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5178 }
5179
5180 MVT CastVT =
5181 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5182 InVT.getVectorNumElements());
5183 if (IsStrict) {
5184 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5185 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5186 {In.getValue(1), In.getValue(0),
5187 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5188 }
5189 In = DAG.getNode(Opc, DL, CastVT, In);
5190 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5191 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5192 }
5193
5194 if (VTSize > InVTSize) {
5195 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5196 EVT CastVT = VT.changeVectorElementTypeToInteger();
5197 In = DAG.getNode(CastOpc, DL, CastVT, In);
5198 if (IsStrict)
5199 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5200 return DAG.getNode(Opc, DL, VT, In);
5201 }
5202
5203 // Use a scalar operation for conversions between single-element vectors of
5204 // the same size.
5205 if (VT.getVectorNumElements() == 1) {
5206 SDValue Extract =
5207 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(), In,
5208 DAG.getConstant(0, DL, MVT::i64));
5209 EVT ScalarVT = VT.getScalarType();
5210 if (IsStrict)
5211 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5212 {Op.getOperand(0), Extract});
5213 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5214 }
5215
5216 return Op;
5217}
5218
5219SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5220 SelectionDAG &DAG) const {
5221 if (Op.getValueType().isVector())
5222 return LowerVectorINT_TO_FP(Op, DAG);
5223
5224 bool IsStrict = Op->isStrictFPOpcode();
5225 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5226
5227 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5228 Op->getOpcode() == ISD::SINT_TO_FP;
5229
5230 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5231 SDLoc DL(Op);
5232 if (IsStrict) {
5233 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5234 {Op.getOperand(0), SrcVal});
5235 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5236 {Op.getValueType(), MVT::Other},
5237 {Val.getValue(1), Val.getValue(0),
5238 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5239 }
5240 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5241 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5242 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5243 };
5244
5245 if (Op.getValueType() == MVT::bf16) {
5246 unsigned MaxWidth = IsSigned
5247 ? DAG.ComputeMaxSignificantBits(SrcVal)
5248 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
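    // An f32 has 24 significand bits and an f64 has 53, so source values with
    // at most that many significant bits convert exactly and only the final
    // round to bf16 can lose precision, avoiding any double rounding.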
5249 // bf16 conversions are promoted to f32 when converting from i16.
5250 if (MaxWidth <= 24) {
5251 return IntToFpViaPromotion(MVT::f32);
5252 }
5253
5254 // bf16 conversions are promoted to f64 when converting from i32.
5255 if (MaxWidth <= 53) {
5256 return IntToFpViaPromotion(MVT::f64);
5257 }
5258
5259 // We need to be careful about i64 -> bf16.
5260 // Consider an i32 22216703.
5261    // This number cannot be represented exactly as an f32, so an itofp will
5262    // turn it into 22216704.0; an fptrunc to bf16 then turns this into
5263    // 22282240.0, whereas the correct bf16 result is 22151168.0.
5264 // We need to use sticky rounding to get this correct.
5265 if (SrcVal.getValueType() == MVT::i64) {
5266 SDLoc DL(Op);
5267 // This algorithm is equivalent to the following:
5268 // uint64_t SrcHi = SrcVal & ~0xfffull;
5269 // uint64_t SrcLo = SrcVal & 0xfffull;
5270 // uint64_t Highest = SrcVal >> 53;
5271 // bool HasHighest = Highest != 0;
5272 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5273 // double Rounded = static_cast<double>(ToRound);
5274 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5275 // uint64_t HasLo = SrcLo != 0;
5276 // bool NeedsAdjustment = HasHighest & HasLo;
5277 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5278 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5279 // return static_cast<__bf16>(Adjusted);
5280 //
5281 // Essentially, what happens is that SrcVal either fits perfectly in a
5282 // double-precision value or it is too big. If it is sufficiently small,
5283 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5284 // ensure that u64 -> double has no rounding error by only using the 52
5285 // MSB of the input. The low order bits will get merged into a sticky bit
5286 // which will avoid issues incurred by double rounding.
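      // Intuitively, OR-ing a 1 into the LSB of the rounded double acts as a
      // sticky bit: bf16 keeps far fewer mantissa bits than f64, so that LSB
      // cannot change which bf16 values the result lies between, but it does
      // stop a discarded non-zero fraction from looking like an exact halfway
      // case in the final f64 -> bf16 rounding.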
5287
5288 // Signed conversion is more or less like so:
5289 // copysign((__bf16)abs(SrcVal), SrcVal)
5290 SDValue SignBit;
5291 if (IsSigned) {
5292 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5293 DAG.getConstant(1ull << 63, DL, MVT::i64));
5294 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5295 }
5296 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5297 DAG.getConstant(~0xfffull, DL, MVT::i64));
5298 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5299 DAG.getConstant(0xfffull, DL, MVT::i64));
5300      SDValue Highest =
5301          DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5302 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5303 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5304 SDValue ToRound =
5305 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5306 SDValue Rounded =
5307 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5308 {Op.getOperand(0), ToRound})
5309 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5310
5311 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5312 if (SignBit) {
5313 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5314 }
5315
5316 SDValue HasHighest = DAG.getSetCC(
5317 DL,
5318 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5319 Highest, Zero64, ISD::SETNE);
5320
5321 SDValue HasLo = DAG.getSetCC(
5322 DL,
5323 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5324 SrcLo, Zero64, ISD::SETNE);
5325
5326 SDValue NeedsAdjustment =
5327 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5328 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5329
5330 SDValue AdjustedBits =
5331 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5332 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5333 return IsStrict
5334 ? DAG.getNode(
5335                        ISD::STRICT_FP_ROUND, DL,
5336                        {Op.getValueType(), MVT::Other},
5337 {Rounded.getValue(1), Adjusted,
5338 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5339 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5340 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5341 }
5342 }
5343
5344 // f16 conversions are promoted to f32 when full fp16 is not supported.
5345 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5346 return IntToFpViaPromotion(MVT::f32);
5347 }
5348
5349 // i128 conversions are libcalls.
5350 if (SrcVal.getValueType() == MVT::i128)
5351 return SDValue();
5352
5353 // Other conversions are legal, unless it's to the completely software-based
5354 // fp128.
5355 if (Op.getValueType() != MVT::f128)
5356 return Op;
5357 return SDValue();
5358}
5359
5360static MVT getSVEContainerType(EVT ContentTy);
5361
5362SDValue
5363AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5364 SelectionDAG &DAG) const {
5365 SDLoc DL(Op);
5366 uint64_t EltSize = Op.getConstantOperandVal(2);
5367 EVT VT = Op.getValueType();
5368 switch (EltSize) {
5369 case 1:
5370 if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
5371 return SDValue();
5372 break;
5373 case 2:
5374 if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
5375 return SDValue();
5376 break;
5377 case 4:
5378 if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
5379 return SDValue();
5380 break;
5381 case 8:
5382 if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
5383 return SDValue();
5384 break;
5385 default:
5386 // Other element sizes are incompatible with whilewr/rw, so expand instead
5387 return SDValue();
5388 }
5389
5390 SDValue PtrA = Op.getOperand(0);
5391 SDValue PtrB = Op.getOperand(1);
5392
5393 if (VT.isScalableVT())
5394 return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
5395
5396 // We can use the SVE whilewr/whilerw instruction to lower this
5397 // intrinsic by creating the appropriate sequence of scalable vector
5398 // operations and then extracting a fixed-width subvector from the scalable
5399 // vector. Scalable vector variants are already legal.
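  // For example, a v16i8 result is computed as an nxv16i1 mask, sign-extended
  // to nxv16i8, and its first 128 bits are then extracted back out as v16i8.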
5400 EVT ContainerVT =
5401      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
5402                       VT.getVectorNumElements(), true);
5403 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5404
5405 SDValue Mask =
5406 DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
5407 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
5408 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
5409 DAG.getVectorIdxConstant(0, DL));
5410}
5411
5412SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5413 SelectionDAG &DAG) const {
5414 EVT OpVT = Op.getValueType();
5415 EVT ArgVT = Op.getOperand(0).getValueType();
5416
5417  if (useSVEForFixedLengthVectorVT(OpVT))
5418    return LowerFixedLengthBitcastToSVE(Op, DAG);
5419
5420 if (OpVT.isScalableVector()) {
5421 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5422
5423 // Handle type legalisation first.
5424 if (!isTypeLegal(ArgVT)) {
5425 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5426 "Expected int->fp bitcast!");
5427
5428 // Bitcasting between unpacked vector types of different element counts is
5429 // not a NOP because the live elements are laid out differently.
5430 // 01234567
5431 // e.g. nxv2i32 = XX??XX??
5432 // nxv4f16 = X?X?X?X?
5433 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5434 return SDValue();
5435
5436 SDValue ExtResult =
5437 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5438 Op.getOperand(0));
5439 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5440 }
5441
5442 // Bitcasts between legal types with the same element count are legal.
5443 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5444 return Op;
5445
5446 // getSVESafeBitCast does not support casting between unpacked types.
5447 if (!isPackedVectorType(OpVT, DAG))
5448 return SDValue();
5449
5450 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5451 }
5452
5453 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5454 return SDValue();
5455
5456 // Bitcasts between f16 and bf16 are legal.
5457 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5458 return Op;
5459
5460 assert(ArgVT == MVT::i16);
5461 SDLoc DL(Op);
5462
5463 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5464 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5465 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5466}
5467
5468// Returns lane if Op extracts from a two-element vector and lane is constant
5469// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5470static std::optional<uint64_t>
5471 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5472   SDNode *OpNode = Op.getNode();
5473 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5474 return std::nullopt;
5475
5476 EVT VT = OpNode->getOperand(0).getValueType();
5477   ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5478   if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5479 return std::nullopt;
5480
5481 return C->getZExtValue();
5482}
5483
5484 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5485                                    bool isSigned) {
5486 EVT VT = N.getValueType();
5487
5488 if (N.getOpcode() != ISD::BUILD_VECTOR)
5489 return false;
5490
5491 for (const SDValue &Elt : N->op_values()) {
5492    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5493      unsigned EltSize = VT.getScalarSizeInBits();
5494 unsigned HalfSize = EltSize / 2;
5495 if (isSigned) {
5496 if (!isIntN(HalfSize, C->getSExtValue()))
5497 return false;
5498 } else {
5499 if (!isUIntN(HalfSize, C->getZExtValue()))
5500 return false;
5501 }
5502 continue;
5503 }
5504 return false;
5505 }
5506
5507 return true;
5508}
5509
5510 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5511   EVT VT = N.getValueType();
5512 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5513 EVT HalfVT = EVT::getVectorVT(
5514 *DAG.getContext(),
5515      VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
5516      VT.getVectorNumElements());
5517  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5518}
5519
5520 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5521   return N.getOpcode() == ISD::SIGN_EXTEND ||
5522 N.getOpcode() == ISD::ANY_EXTEND ||
5523 isExtendedBUILD_VECTOR(N, DAG, true);
5524}
5525
5526 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5527   return N.getOpcode() == ISD::ZERO_EXTEND ||
5528 N.getOpcode() == ISD::ANY_EXTEND ||
5529 isExtendedBUILD_VECTOR(N, DAG, false);
5530}
5531
5532 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5533   unsigned Opcode = N.getOpcode();
5534 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5535 SDValue N0 = N.getOperand(0);
5536 SDValue N1 = N.getOperand(1);
5537 return N0->hasOneUse() && N1->hasOneUse() &&
5538 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5539 }
5540 return false;
5541}
5542
5543 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5544   unsigned Opcode = N.getOpcode();
5545 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5546 SDValue N0 = N.getOperand(0);
5547 SDValue N1 = N.getOperand(1);
5548 return N0->hasOneUse() && N1->hasOneUse() &&
5549 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5550 }
5551 return false;
5552}
5553
5554SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5555 SelectionDAG &DAG) const {
5556  // The rounding mode is in bits 23:22 of the FPCR.
5557  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5558  // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
5559  // so that the shift and the AND get folded into a bitfield extract.
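  // For example, considering only the RMode bits, a value of 2 gives
  // (((2 << 22) + (1 << 22)) >> 22) & 3 = 3, matching the 2->3 entry above.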
5560 SDLoc DL(Op);
5561
5562 SDValue Chain = Op.getOperand(0);
5563 SDValue FPCR_64 =
5564 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5565 {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL,
5566 MVT::i64)});
5567 Chain = FPCR_64.getValue(1);
5568 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5569 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5570 DAG.getConstant(1U << 22, DL, MVT::i32));
5571 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5572 DAG.getConstant(22, DL, MVT::i32));
5573 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5574 DAG.getConstant(3, DL, MVT::i32));
5575 return DAG.getMergeValues({AND, Chain}, DL);
5576}
5577
5578SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5579 SelectionDAG &DAG) const {
5580 SDLoc DL(Op);
5581 SDValue Chain = Op->getOperand(0);
5582 SDValue RMValue = Op->getOperand(1);
5583
5584 // The rounding mode is in bits 23:22 of the FPCR.
5585  // The mapping from the llvm.set.rounding argument value to the rounding
5586  // mode in FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement
5587  // this is (((arg - 1) & 3) << 22).
5588  //
5589  // The argument of llvm.set.rounding must be within the range [0, 3], so
5590  // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5591  // code that generates llvm.set.rounding to ensure this condition.
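  // For example, an argument of 0 gives ((0 - 1) & 3) = 3, which is then
  // shifted into bits 23:22, matching the 0->3 entry above.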
5592
5593 // Calculate new value of FPCR[23:22].
5594 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5595 DAG.getConstant(1, DL, MVT::i32));
5596 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5597 DAG.getConstant(0x3, DL, MVT::i32));
5598 RMValue =
5599 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5600 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5601 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5602
5603 // Get current value of FPCR.
5604 SDValue Ops[] = {
5605 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5606 SDValue FPCR =
5607 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5608 Chain = FPCR.getValue(1);
5609 FPCR = FPCR.getValue(0);
5610
5611  // Put the new rounding mode into FPCR[23:22].
5612 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5613 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5614 DAG.getConstant(RMMask, DL, MVT::i64));
5615 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5616 SDValue Ops2[] = {
5617 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5618 FPCR};
5619 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5620}
5621
5622SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5623 SelectionDAG &DAG) const {
5624 SDLoc DL(Op);
5625 SDValue Chain = Op->getOperand(0);
5626
5627 // Get current value of FPCR.
5628 SDValue Ops[] = {
5629 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5630 SDValue FPCR =
5631 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5632 Chain = FPCR.getValue(1);
5633 FPCR = FPCR.getValue(0);
5634
5635 // Truncate FPCR to 32 bits.
5636 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5637
5638 return DAG.getMergeValues({Result, Chain}, DL);
5639}
5640
5641SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5642 SelectionDAG &DAG) const {
5643 SDLoc DL(Op);
5644 SDValue Chain = Op->getOperand(0);
5645 SDValue Mode = Op->getOperand(1);
5646
5647 // Extend the specified value to 64 bits.
5648 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5649
5650 // Set new value of FPCR.
5651 SDValue Ops2[] = {
5652 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5653 FPCR};
5654 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5655}
5656
5657SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5658 SelectionDAG &DAG) const {
5659 SDLoc DL(Op);
5660 SDValue Chain = Op->getOperand(0);
5661
5662 // Get current value of FPCR.
5663 SDValue Ops[] = {
5664 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5665 SDValue FPCR =
5666 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5667 Chain = FPCR.getValue(1);
5668 FPCR = FPCR.getValue(0);
5669
5670 // Clear bits that are not reserved.
5671 SDValue FPSCRMasked = DAG.getNode(
5672 ISD::AND, DL, MVT::i64, FPCR,
5674
5675 // Set new value of FPCR.
5676 SDValue Ops2[] = {
5677 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5678 FPSCRMasked};
5679 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5680}
5681
5682static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5683 SDLoc DL, bool &IsMLA) {
5684 bool IsN0SExt = isSignExtended(N0, DAG);
5685 bool IsN1SExt = isSignExtended(N1, DAG);
5686 if (IsN0SExt && IsN1SExt)
5687 return AArch64ISD::SMULL;
5688
5689 bool IsN0ZExt = isZeroExtended(N0, DAG);
5690 bool IsN1ZExt = isZeroExtended(N1, DAG);
5691
5692 if (IsN0ZExt && IsN1ZExt)
5693 return AArch64ISD::UMULL;
5694
5695 // Select UMULL if we can replace the other operand with an extend.
5696 EVT VT = N0.getValueType();
5697 unsigned EltSize = VT.getScalarSizeInBits();
5698 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5699 if (IsN0ZExt || IsN1ZExt) {
5700 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5701 return AArch64ISD::UMULL;
5702 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5703 DAG.MaskedValueIsZero(N1, Mask)) {
5704 // For v2i64 we look more aggressively at both operands being zero, to avoid
5705 // scalarization.
5706 return AArch64ISD::UMULL;
5707 }
5708
5709 if (IsN0SExt || IsN1SExt) {
5710 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5711 return AArch64ISD::SMULL;
5712 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5713 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5714 return AArch64ISD::SMULL;
5715 }
5716
5717 if (!IsN1SExt && !IsN1ZExt)
5718 return 0;
5719
5720 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5721 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5722 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5723 IsMLA = true;
5724 return AArch64ISD::SMULL;
5725 }
5726 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5727 IsMLA = true;
5728 return AArch64ISD::UMULL;
5729 }
5730 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5731 std::swap(N0, N1);
5732 IsMLA = true;
5733 return AArch64ISD::UMULL;
5734 }
5735 return 0;
5736}
5737
5738SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5739 EVT VT = Op.getValueType();
5740
5741 bool OverrideNEON = !Subtarget->isNeonAvailable();
5742 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5743 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5744
5745 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5746 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5747 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5748 "unexpected type for custom-lowering ISD::MUL");
5749 SDValue N0 = Op.getOperand(0);
5750 SDValue N1 = Op.getOperand(1);
5751 bool isMLA = false;
5752 EVT OVT = VT;
5753 if (VT.is64BitVector()) {
5754 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5755 isNullConstant(N0.getOperand(1)) &&
5756        N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5757        isNullConstant(N1.getOperand(1))) {
5758 N0 = N0.getOperand(0);
5759 N1 = N1.getOperand(0);
5760 VT = N0.getValueType();
5761 } else {
5762 if (VT == MVT::v1i64) {
5763 if (Subtarget->hasSVE())
5764 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5765 // Fall through to expand this. It is not legal.
5766 return SDValue();
5767 } else
5768 // Other vector multiplications are legal.
5769 return Op;
5770 }
5771 }
5772
5773 SDLoc DL(Op);
5774 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5775
5776 if (!NewOpc) {
5777 if (VT.getVectorElementType() == MVT::i64) {
5778 // If SVE is available then i64 vector multiplications can also be made
5779 // legal.
5780 if (Subtarget->hasSVE())
5781 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5782 // Fall through to expand this. It is not legal.
5783 return SDValue();
5784 } else
5785 // Other vector multiplications are legal.
5786 return Op;
5787 }
5788
5789 // Legalize to a S/UMULL instruction
5790 SDValue Op0;
5791 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5792 if (!isMLA) {
5793 Op0 = skipExtensionForVectorMULL(N0, DAG);
5794    assert(Op0.getValueType().is64BitVector() &&
5795           Op1.getValueType().is64BitVector() &&
5796 "unexpected types for extended operands to VMULL");
5797 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5798 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5799 DAG.getConstant(0, DL, MVT::i64));
5800 }
5801 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5802 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5803 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5804  SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5805  SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5806  EVT Op1VT = Op1.getValueType();
5807 return DAG.getNode(
5808      ISD::EXTRACT_SUBVECTOR, DL, OVT,
5809      DAG.getNode(N0.getOpcode(), DL, VT,
5810 DAG.getNode(NewOpc, DL, VT,
5811 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5812 DAG.getNode(NewOpc, DL, VT,
5813 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5814 DAG.getConstant(0, DL, MVT::i64));
5815}
5816
5817static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5818 int Pattern) {
5819 if (Pattern == AArch64SVEPredPattern::all)
5820 return DAG.getConstant(1, DL, VT);
5821 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5822 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5823}
5824
5825 static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5826                                          bool IsSigned, bool IsEqual) {
5827 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5828 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5829
5830 if (!N->getValueType(0).isScalableVector() ||
5831 !isa<ConstantSDNode>(N->getOperand(Op1)))
5832 return SDValue();
5833
5834 SDLoc DL(N);
5835 APInt Y = N->getConstantOperandAPInt(Op1);
5836
5837 // When the second operand is the maximum value, comparisons that include
5838 // equality can never fail and thus we can return an all active predicate.
5839 if (IsEqual)
5840 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5841 return DAG.getConstant(1, DL, N->getValueType(0));
5842
5843 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5844 return SDValue();
5845
5846 APInt X = N->getConstantOperandAPInt(Op0);
5847
5848 bool Overflow;
5849 APInt NumActiveElems =
5850 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5851
5852 if (Overflow)
5853 return SDValue();
5854
5855 if (IsEqual) {
5856 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5857 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5858 : NumActiveElems.uadd_ov(One, Overflow);
5859 if (Overflow)
5860 return SDValue();
5861 }
5862
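  // For example, whilele(3, 7) has 5 active lanes; if the minimum SVE vector
  // length guarantees at least 5 lanes at this element size, the result can be
  // materialized as a PTRUE with the VL5 pattern.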
5863 std::optional<unsigned> PredPattern =
5864      getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5865  unsigned MinSVEVectorSize = std::max(
5867 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
5868 if (PredPattern != std::nullopt &&
5869 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5870 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
5871
5872 return SDValue();
5873}
5874
5875// Returns a safe bitcast between two scalable vector predicates, where
5876// any newly created lanes from a widening bitcast are defined as zero.
5877 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5878   SDLoc DL(Op);
5879 EVT InVT = Op.getValueType();
5880
5881 assert(InVT.getVectorElementType() == MVT::i1 &&
5882 VT.getVectorElementType() == MVT::i1 &&
5883 "Expected a predicate-to-predicate bitcast");
5884  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5885         InVT.isScalableVector() &&
5886 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5887 "Only expect to cast between legal scalable predicate types!");
5888
5889  // Return the operand if the cast isn't changing type.
5890 if (InVT == VT)
5891 return Op;
5892
5893 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5894 // than VT. This will increase the chances of removing casts that introduce
5895 // new lanes, which have to be explicitly zero'd.
5896 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5897 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5898 Op.getOperand(1).getValueType().bitsGT(VT))
5899 Op = Op.getOperand(1);
5900
5901 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5902
5903 // We only have to zero the lanes if new lanes are being defined, e.g. when
5904 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5905 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5906 // we can return here.
5907 if (InVT.bitsGT(VT))
5908 return Reinterpret;
5909
5910 // Check if the other lanes are already known to be zeroed by
5911 // construction.
5912  if (isZeroingInactiveLanes(Op))
5913    return Reinterpret;
5914
5915 // Zero the newly introduced lanes.
5916 SDValue Mask = DAG.getConstant(1, DL, InVT);
5917 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5918 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5919}
5920
5921SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5922 SDValue Chain, SDLoc DL,
5923 EVT VT) const {
5924 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
5927 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5928 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5929 TargetLowering::CallLoweringInfo CLI(DAG);
5931 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5932 getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
5933 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5934 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5935 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5936 Mask);
5937}
5938
5939// Lower an SME LDR/STR ZA intrinsic
5940// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5941// folded into the instruction
5942// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5943// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5944// and tile slice registers
5945// ldr(%tileslice, %ptr, %vecnum)
5946// ->
5947// %svl = rdsvl
5948// %ptr2 = %ptr + %svl * %vecnum
5949// %tileslice2 = %tileslice + %vecnum
5950// ldr [%tileslice2, 0], [%ptr2, 0]
5951// Case 3: If the vecnum is an immediate out of range, then the same is done as
5952// case 2, but the base and slice registers are modified by the greatest
5953// multiple of 15 lower than the vecnum and the remainder is folded into the
5954// instruction. This means that successive loads and stores that are offset from
5955// each other can share the same base and slice register updates.
5956// ldr(%tileslice, %ptr, 22)
5957// ldr(%tileslice, %ptr, 23)
5958// ->
5959// %svl = rdsvl
5960// %ptr2 = %ptr + %svl * 15
5961// %tileslice2 = %tileslice + 15
5962// ldr [%tileslice2, 7], [%ptr2, 7]
5963// ldr [%tileslice2, 8], [%ptr2, 8]
5964// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5965// operand and the immediate can be folded into the instruction, like case 2.
5966// ldr(%tileslice, %ptr, %vecnum + 7)
5967// ldr(%tileslice, %ptr, %vecnum + 8)
5968// ->
5969// %svl = rdsvl
5970// %ptr2 = %ptr + %svl * %vecnum
5971// %tileslice2 = %tileslice + %vecnum
5972// ldr [%tileslice2, 7], [%ptr2, 7]
5973// ldr [%tileslice2, 8], [%ptr2, 8]
5974// Case 5: The vecnum being an add of an immediate out of range is also handled,
5975// in which case the same remainder logic as case 3 is used.
5976 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5977   SDLoc DL(N);
5978
5979 SDValue TileSlice = N->getOperand(2);
5980 SDValue Base = N->getOperand(3);
5981 SDValue VecNum = N->getOperand(4);
5982 int32_t ConstAddend = 0;
5983 SDValue VarAddend = VecNum;
5984
5985 // If the vnum is an add of an immediate, we can fold it into the instruction
5986 if (VecNum.getOpcode() == ISD::ADD &&
5987 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5988 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5989 VarAddend = VecNum.getOperand(0);
5990 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5991 ConstAddend = ImmNode->getSExtValue();
5992 VarAddend = SDValue();
5993 }
5994
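  // For example, a constant addend of 22 splits into ImmAddend = 6 (folded
  // into the instruction's immediate) and C = 16 (folded into the base and
  // tile slice updates below).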
5995 int32_t ImmAddend = ConstAddend % 16;
5996 if (int32_t C = (ConstAddend - ImmAddend)) {
5997 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5998 VarAddend = VarAddend
5999 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6000 : CVal;
6001 }
6002
6003 if (VarAddend) {
6004 // Get the vector length that will be multiplied by vnum
6005 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6006 DAG.getConstant(1, DL, MVT::i32));
6007
6008 // Multiply SVL and vnum then add it to the base
6009 SDValue Mul = DAG.getNode(
6010 ISD::MUL, DL, MVT::i64,
6011 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6012 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6013 // Just add vnum to the tileslice
6014 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6015 }
6016
6017 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6018 DL, MVT::Other,
6019 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6020 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6021}
6022
6023 static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
6024   SDLoc DL(Op);
6025 SDValue ID =
6026 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6027
6028 auto Op1 = Op.getOperand(1);
6029 auto Op2 = Op.getOperand(2);
6030 auto Mask = Op.getOperand(3);
6031
6032 EVT Op1VT = Op1.getValueType();
6033 EVT Op2VT = Op2.getValueType();
6034 EVT ResVT = Op.getValueType();
6035
6036 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6037 Op1VT.getVectorElementType() == MVT::i16) &&
6038 "Expected 8-bit or 16-bit characters.");
6039
6040 // Scalable vector type used to wrap operands.
6041 // A single container is enough for both operands because ultimately the
6042 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6043 EVT OpContainerVT = Op1VT.isScalableVector()
6044 ? Op1VT
6045                              : getContainerForFixedLengthVector(DAG, Op1VT);
6046
6047 if (Op2VT.is128BitVector()) {
6048 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6049 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6050 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6051 if (ResVT.isScalableVector())
6052 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6053 DAG.getTargetConstant(0, DL, MVT::i64));
6054 } else {
6055 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
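    // For example, a v8i8 needle is viewed as a single i64 element, splatted
    // across an nxv2i64 vector, and then bitcast back to the container type.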
6056 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6057 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6058 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6059 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6060 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6061 DAG.getConstant(0, DL, MVT::i64));
6062 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6063 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6064 }
6065
6066 // If the result is scalable, we just need to carry out the MATCH.
6067 if (ResVT.isScalableVector())
6068 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6069
6070 // If the result is fixed, we can still use MATCH but we need to wrap the
6071 // first operand and the mask in scalable vectors before doing so.
6072
6073 // Wrap the operands.
6074 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6075 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6076 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6077
6078 // Carry out the match.
6079 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6080 ID, Mask, Op1, Op2);
6081
6082 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6083 // (v16i8/v8i8).
6084 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6085 Match = convertFromScalableVector(DAG, Op1VT, Match);
6086 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6087}
6088
6089SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6090 SelectionDAG &DAG) const {
6091 unsigned IntNo = Op.getConstantOperandVal(1);
6092 SDLoc DL(Op);
6093 switch (IntNo) {
6094 default:
6095 return SDValue(); // Don't custom lower most intrinsics.
6096 case Intrinsic::aarch64_prefetch: {
6097 SDValue Chain = Op.getOperand(0);
6098 SDValue Addr = Op.getOperand(2);
6099
6100 unsigned IsWrite = Op.getConstantOperandVal(3);
6101 unsigned Locality = Op.getConstantOperandVal(4);
6102 unsigned IsStream = Op.getConstantOperandVal(5);
6103 unsigned IsData = Op.getConstantOperandVal(6);
6104 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6105 (!IsData << 3) | // IsDataCache bit
6106 (Locality << 1) | // Cache level bits
6107 (unsigned)IsStream; // Stream bit
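    // For example, a read prefetch of data (IsWrite = 0, IsData = 1) with
    // locality 3 and no streaming hint encodes as (0<<4)|(0<<3)|(3<<1)|0 = 6.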
6108
6109 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6110 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6111 }
6112 case Intrinsic::aarch64_sme_str:
6113 case Intrinsic::aarch64_sme_ldr: {
6114 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6115 }
6116 case Intrinsic::aarch64_sme_za_enable:
6117 return DAG.getNode(
6118 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6119 Op->getOperand(0), // Chain
6120 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6121 case Intrinsic::aarch64_sme_za_disable:
6122 return DAG.getNode(
6123 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6124 Op->getOperand(0), // Chain
6125 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6126 }
6127}
6128
6129SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6130 SelectionDAG &DAG) const {
6131 unsigned IntNo = Op.getConstantOperandVal(1);
6132 SDLoc DL(Op);
6133 switch (IntNo) {
6134 default:
6135 return SDValue(); // Don't custom lower most intrinsics.
6136 case Intrinsic::aarch64_mops_memset_tag: {
6137 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6138 SDValue Chain = Node->getChain();
6139 SDValue Dst = Op.getOperand(2);
6140 SDValue Val = Op.getOperand(3);
6141 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6142 SDValue Size = Op.getOperand(4);
6143 auto Alignment = Node->getMemOperand()->getAlign();
6144 bool IsVol = Node->isVolatile();
6145 auto DstPtrInfo = Node->getPointerInfo();
6146
6147 const auto &SDI =
6148 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6149 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6150 Chain, Dst, Val, Size, Alignment, IsVol,
6151 DstPtrInfo, MachinePointerInfo{});
6152
6153 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6154 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6155 // LowerOperationWrapper will complain that the number of results has
6156 // changed.
6157 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6158 }
6159 }
6160}
6161
6162SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6163 SelectionDAG &DAG) const {
6164 unsigned IntNo = Op.getConstantOperandVal(0);
6165 SDLoc DL(Op);
6166 switch (IntNo) {
6167 default: return SDValue(); // Don't custom lower most intrinsics.
6168 case Intrinsic::thread_pointer: {
6169 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6170 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6171 }
6172 case Intrinsic::aarch64_sve_whilewr_b:
6173 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6174 Op.getOperand(1), Op.getOperand(2),
6175 DAG.getConstant(1, DL, MVT::i64));
6176 case Intrinsic::aarch64_sve_whilewr_h:
6177 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6178 Op.getOperand(1), Op.getOperand(2),
6179 DAG.getConstant(2, DL, MVT::i64));
6180 case Intrinsic::aarch64_sve_whilewr_s:
6181 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6182 Op.getOperand(1), Op.getOperand(2),
6183 DAG.getConstant(4, DL, MVT::i64));
6184 case Intrinsic::aarch64_sve_whilewr_d:
6185 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6186 Op.getOperand(1), Op.getOperand(2),
6187 DAG.getConstant(8, DL, MVT::i64));
6188 case Intrinsic::aarch64_sve_whilerw_b:
6189 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6190 Op.getOperand(1), Op.getOperand(2),
6191 DAG.getConstant(1, DL, MVT::i64));
6192 case Intrinsic::aarch64_sve_whilerw_h:
6193 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6194 Op.getOperand(1), Op.getOperand(2),
6195 DAG.getConstant(2, DL, MVT::i64));
6196 case Intrinsic::aarch64_sve_whilerw_s:
6197 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6198 Op.getOperand(1), Op.getOperand(2),
6199 DAG.getConstant(4, DL, MVT::i64));
6200 case Intrinsic::aarch64_sve_whilerw_d:
6201 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6202 Op.getOperand(1), Op.getOperand(2),
6203 DAG.getConstant(8, DL, MVT::i64));
6204 case Intrinsic::aarch64_neon_abs: {
6205 EVT Ty = Op.getValueType();
6206 if (Ty == MVT::i64) {
6207 SDValue Result =
6208 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6209 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6210 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6211 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6212 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6213 } else {
6214 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6215 }
6216 }
6217 case Intrinsic::aarch64_neon_pmull64: {
6218 SDValue LHS = Op.getOperand(1);
6219 SDValue RHS = Op.getOperand(2);
6220
6221 std::optional<uint64_t> LHSLane =
6222        getConstantLaneNumOfExtractHalfOperand(LHS);
6223    std::optional<uint64_t> RHSLane =
6224        getConstantLaneNumOfExtractHalfOperand(RHS);
6225
6226 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6227 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6228
6229    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
6230 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6231 // which ISel recognizes better. For example, generate a ldr into d*
6232 // registers as opposed to a GPR load followed by a fmov.
6233 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6234 std::optional<uint64_t> OtherLane,
6235 const SDLoc &DL,
6236 SelectionDAG &DAG) -> SDValue {
6237      // If the operand is a higher half itself, rewrite it to
6238 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6239 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6240 if (NLane == 1)
6241 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6242 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6243
6244 // Operand N is not a higher half but the other operand is.
6245 if (OtherLane == 1) {
6246 // If this operand is a lower half, rewrite it to
6247 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6248 // align lanes of two operands. A roundtrip sequence (to move from lane
6249 // 1 to lane 0) is like this:
6250 // mov x8, v0.d[1]
6251 // fmov d0, x8
6252 if (NLane == 0)
6253 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6254 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6255 N.getOperand(0),
6256 DAG.getConstant(0, DL, MVT::i64)),
6257 DAG.getConstant(1, DL, MVT::i64));
6258
6259 // Otherwise just dup from main to all lanes.
6260 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6261 }
6262
6263 // Neither operand is an extract of higher half, so codegen may just use
6264 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6265 assert(N.getValueType() == MVT::i64 &&
6266 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6267 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6268 };
6269
6270 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6271 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6272
6273 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6274 }
6275 case Intrinsic::aarch64_neon_smax:
6276 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6277 Op.getOperand(2));
6278 case Intrinsic::aarch64_neon_umax:
6279 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6280 Op.getOperand(2));
6281 case Intrinsic::aarch64_neon_smin:
6282 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6283 Op.getOperand(2));
6284 case Intrinsic::aarch64_neon_umin:
6285 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6286 Op.getOperand(2));
6287 case Intrinsic::aarch64_neon_scalar_sqxtn:
6288 case Intrinsic::aarch64_neon_scalar_sqxtun:
6289 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6290 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6291 if (Op.getValueType() == MVT::i32)
6292 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6293 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6294 Op.getOperand(0),
6295 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6296 Op.getOperand(1))));
6297 return SDValue();
6298 }
6299 case Intrinsic::aarch64_neon_sqxtn:
6300 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6301 Op.getOperand(1));
6302 case Intrinsic::aarch64_neon_sqxtun:
6303 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6304 Op.getOperand(1));
6305 case Intrinsic::aarch64_neon_uqxtn:
6306 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6307 Op.getOperand(1));
6308 case Intrinsic::aarch64_neon_sqshrn:
6309 if (Op.getValueType().isVector())
6310 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6311 DAG.getNode(AArch64ISD::VASHR, DL,
6312 Op.getOperand(1).getValueType(),
6313 Op.getOperand(1), Op.getOperand(2)));
6314 return SDValue();
6315 case Intrinsic::aarch64_neon_sqshrun:
6316 if (Op.getValueType().isVector())
6317 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6318 DAG.getNode(AArch64ISD::VASHR, DL,
6319 Op.getOperand(1).getValueType(),
6320 Op.getOperand(1), Op.getOperand(2)));
6321 return SDValue();
6322 case Intrinsic::aarch64_neon_uqshrn:
6323 if (Op.getValueType().isVector())
6324 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6325 DAG.getNode(AArch64ISD::VLSHR, DL,
6326 Op.getOperand(1).getValueType(),
6327 Op.getOperand(1), Op.getOperand(2)));
6328 return SDValue();
6329 case Intrinsic::aarch64_neon_sqrshrn:
6330 if (Op.getValueType().isVector())
6331 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6332 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6333 Op.getOperand(1).getValueType(),
6334 Op.getOperand(1), Op.getOperand(2)));
6335 return SDValue();
6336 case Intrinsic::aarch64_neon_sqrshrun:
6337 if (Op.getValueType().isVector())
6338 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6339 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6340 Op.getOperand(1).getValueType(),
6341 Op.getOperand(1), Op.getOperand(2)));
6342 return SDValue();
6343 case Intrinsic::aarch64_neon_uqrshrn:
6344 if (Op.getValueType().isVector())
6345 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6346 DAG.getNode(AArch64ISD::URSHR_I, DL,
6347 Op.getOperand(1).getValueType(),
6348 Op.getOperand(1), Op.getOperand(2)));
6349 return SDValue();
6350 case Intrinsic::aarch64_neon_sqadd:
6351 if (Op.getValueType().isVector())
6352 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6353 Op.getOperand(2));
6354 return SDValue();
6355 case Intrinsic::aarch64_neon_sqsub:
6356 if (Op.getValueType().isVector())
6357 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6358 Op.getOperand(2));
6359 return SDValue();
6360 case Intrinsic::aarch64_neon_uqadd:
6361 if (Op.getValueType().isVector())
6362 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6363 Op.getOperand(2));
6364 return SDValue();
6365 case Intrinsic::aarch64_neon_uqsub:
6366 if (Op.getValueType().isVector())
6367 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6368 Op.getOperand(2));
6369 return SDValue();
6370 case Intrinsic::aarch64_sve_whilelt:
6371 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6372 /*IsEqual=*/false);
6373 case Intrinsic::aarch64_sve_whilels:
6374 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6375 /*IsEqual=*/true);
6376 case Intrinsic::aarch64_sve_whilele:
6377 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6378 /*IsEqual=*/true);
6379 case Intrinsic::aarch64_sve_sunpkhi:
6380 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6381 Op.getOperand(1));
6382 case Intrinsic::aarch64_sve_sunpklo:
6383 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6384 Op.getOperand(1));
6385 case Intrinsic::aarch64_sve_uunpkhi:
6386 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6387 Op.getOperand(1));
6388 case Intrinsic::aarch64_sve_uunpklo:
6389 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6390 Op.getOperand(1));
6391 case Intrinsic::aarch64_sve_clasta_n:
6392 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6393 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6394 case Intrinsic::aarch64_sve_clastb_n:
6395 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6396 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6397 case Intrinsic::aarch64_sve_lasta:
6398 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6399 Op.getOperand(1), Op.getOperand(2));
6400 case Intrinsic::aarch64_sve_lastb:
6401 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6402 Op.getOperand(1), Op.getOperand(2));
6403 case Intrinsic::aarch64_sve_rev:
6404 return DAG.getNode(ISD::VECTOR_REVERSE, DL, Op.getValueType(),
6405 Op.getOperand(1));
6406 case Intrinsic::aarch64_sve_tbl:
6407 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6408 Op.getOperand(2));
6409 case Intrinsic::aarch64_sve_trn1:
6410 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6411 Op.getOperand(1), Op.getOperand(2));
6412 case Intrinsic::aarch64_sve_trn2:
6413 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6414 Op.getOperand(1), Op.getOperand(2));
6415 case Intrinsic::aarch64_sve_uzp1:
6416 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6417 Op.getOperand(1), Op.getOperand(2));
6418 case Intrinsic::aarch64_sve_uzp2:
6419 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6420 Op.getOperand(1), Op.getOperand(2));
6421 case Intrinsic::aarch64_sve_zip1:
6422 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6423 Op.getOperand(1), Op.getOperand(2));
6424 case Intrinsic::aarch64_sve_zip2:
6425 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6426 Op.getOperand(1), Op.getOperand(2));
6427 case Intrinsic::aarch64_sve_splice:
6428 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6429 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6430 case Intrinsic::aarch64_sve_ptrue:
6431 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6432 case Intrinsic::aarch64_sve_clz:
6433 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6434 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6435 case Intrinsic::aarch64_sme_cntsd: {
6436 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6437 DAG.getConstant(1, DL, MVT::i32));
6438 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6439 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6440 }
6441 case Intrinsic::aarch64_sve_cnt: {
6442 SDValue Data = Op.getOperand(3);
6443 // CTPOP only supports integer operands.
6444 if (Data.getValueType().isFloatingPoint())
6445 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6446 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6447 Op.getOperand(2), Data, Op.getOperand(1));
6448 }
6449 case Intrinsic::aarch64_sve_dupq_lane:
6450 return LowerDUPQLane(Op, DAG);
6451 case Intrinsic::aarch64_sve_convert_from_svbool:
6452 if (Op.getValueType() == MVT::aarch64svcount)
6453 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6454 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6455 case Intrinsic::aarch64_sve_convert_to_svbool:
6456 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6457 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6458 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6459 case Intrinsic::aarch64_sve_fneg:
6460 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6461 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6462 case Intrinsic::aarch64_sve_frintp:
6463 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6464 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6465 case Intrinsic::aarch64_sve_frintm:
6466 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6467 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6468 case Intrinsic::aarch64_sve_frinti:
6469 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6470 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6471 Op.getOperand(1));
6472 case Intrinsic::aarch64_sve_frintx:
6473 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6474 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6475 case Intrinsic::aarch64_sve_frinta:
6476 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6477 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6478 case Intrinsic::aarch64_sve_frintn:
6479 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6480 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6481 Op.getOperand(1));
6482 case Intrinsic::aarch64_sve_frintz:
6483 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6484 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6485 case Intrinsic::aarch64_sve_ucvtf:
6486 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6487 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6488 Op.getOperand(1));
6489 case Intrinsic::aarch64_sve_scvtf:
6490 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6491 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6492 Op.getOperand(1));
6493 case Intrinsic::aarch64_sve_fcvtzu:
6494 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6495 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6496 case Intrinsic::aarch64_sve_fcvtzs:
6497 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6498 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6499 case Intrinsic::aarch64_sve_fsqrt:
6500 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6501 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6502 case Intrinsic::aarch64_sve_frecpx:
6503 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6504 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6505 case Intrinsic::aarch64_sve_frecpe_x:
6506 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6507 Op.getOperand(1));
6508 case Intrinsic::aarch64_sve_frecps_x:
6509 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6510 Op.getOperand(1), Op.getOperand(2));
6511 case Intrinsic::aarch64_sve_frsqrte_x:
6512 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6513 Op.getOperand(1));
6514 case Intrinsic::aarch64_sve_frsqrts_x:
6515 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6516 Op.getOperand(1), Op.getOperand(2));
6517 case Intrinsic::aarch64_sve_fabs:
6518 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6519 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6520 case Intrinsic::aarch64_sve_abs:
6521 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6522 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6523 case Intrinsic::aarch64_sve_neg:
6524 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6525 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6526 case Intrinsic::aarch64_sve_insr: {
6527 SDValue Scalar = Op.getOperand(2);
6528 EVT ScalarTy = Scalar.getValueType();
6529 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6530 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6531
6532 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6533 Op.getOperand(1), Scalar);
6534 }
6535 case Intrinsic::aarch64_sve_rbit:
6536 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6537 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6538 Op.getOperand(1));
6539 case Intrinsic::aarch64_sve_revb:
6540 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6541 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6542 case Intrinsic::aarch64_sve_revh:
6543 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6544 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6545 case Intrinsic::aarch64_sve_revw:
6546 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6547 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6548 case Intrinsic::aarch64_sve_revd:
6549 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6550 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6551 case Intrinsic::aarch64_sve_sxtb:
6552 return DAG.getNode(
6553 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6554 Op.getOperand(2), Op.getOperand(3),
6555 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6556 Op.getOperand(1));
6557 case Intrinsic::aarch64_sve_sxth:
6558 return DAG.getNode(
6559 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6560 Op.getOperand(2), Op.getOperand(3),
6561 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6562 Op.getOperand(1));
6563 case Intrinsic::aarch64_sve_sxtw:
6564 return DAG.getNode(
6565 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6566 Op.getOperand(2), Op.getOperand(3),
6567 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6568 Op.getOperand(1));
6569 case Intrinsic::aarch64_sve_uxtb:
6570 return DAG.getNode(
6571 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6572 Op.getOperand(2), Op.getOperand(3),
6573 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6574 Op.getOperand(1));
6575 case Intrinsic::aarch64_sve_uxth:
6576 return DAG.getNode(
6577 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6578 Op.getOperand(2), Op.getOperand(3),
6579 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6580 Op.getOperand(1));
6581 case Intrinsic::aarch64_sve_uxtw:
6582 return DAG.getNode(
6583 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6584 Op.getOperand(2), Op.getOperand(3),
6585 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6586 Op.getOperand(1));
6587 case Intrinsic::localaddress: {
6588 const auto &MF = DAG.getMachineFunction();
6589 const auto *RegInfo = Subtarget->getRegisterInfo();
6590 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6591 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6592 Op.getSimpleValueType());
6593 }
6594
6595 case Intrinsic::eh_recoverfp: {
6596 // FIXME: This needs to be implemented to correctly handle highly aligned
6597 // stack objects. For now we simply return the incoming FP. Refer D53541
6598 // for more details.
6599 SDValue FnOp = Op.getOperand(1);
6600 SDValue IncomingFPOp = Op.getOperand(2);
6601 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6602 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6603 if (!Fn)
6604      report_fatal_error(
6605          "llvm.eh.recoverfp must take a function as the first argument");
6606 return IncomingFPOp;
6607 }
6608 case Intrinsic::aarch64_neon_vsri:
6609 case Intrinsic::aarch64_neon_vsli:
6610 case Intrinsic::aarch64_sve_sri:
6611 case Intrinsic::aarch64_sve_sli: {
6612 EVT Ty = Op.getValueType();
6613
6614 if (!Ty.isVector())
6615 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6616
6617 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6618
6619 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6620 IntNo == Intrinsic::aarch64_sve_sri;
6621 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6622 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6623 Op.getOperand(3));
6624 }
6625
6626 case Intrinsic::aarch64_neon_srhadd:
6627 case Intrinsic::aarch64_neon_urhadd:
6628 case Intrinsic::aarch64_neon_shadd:
6629 case Intrinsic::aarch64_neon_uhadd: {
6630 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6631 IntNo == Intrinsic::aarch64_neon_shadd);
6632 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6633 IntNo == Intrinsic::aarch64_neon_urhadd);
6634 unsigned Opcode = IsSignedAdd
6635 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6636 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6637 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6638 Op.getOperand(2));
6639 }
6640 case Intrinsic::aarch64_neon_saddlp:
6641 case Intrinsic::aarch64_neon_uaddlp: {
6642 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6643 ? AArch64ISD::UADDLP
6644 : AArch64ISD::SADDLP;
6645 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6646 }
6647 case Intrinsic::aarch64_neon_sdot:
6648 case Intrinsic::aarch64_neon_udot:
6649 case Intrinsic::aarch64_sve_sdot:
6650 case Intrinsic::aarch64_sve_udot: {
6651 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6652 IntNo == Intrinsic::aarch64_sve_udot)
6653 ? AArch64ISD::UDOT
6654 : AArch64ISD::SDOT;
6655 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6656 Op.getOperand(2), Op.getOperand(3));
6657 }
6658 case Intrinsic::aarch64_neon_usdot:
6659 case Intrinsic::aarch64_sve_usdot: {
6660 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6661 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6662 }
6663 case Intrinsic::aarch64_neon_saddlv:
6664 case Intrinsic::aarch64_neon_uaddlv: {
6665 EVT OpVT = Op.getOperand(1).getValueType();
6666 EVT ResVT = Op.getValueType();
6667 assert(
6668 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6669 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6670 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6671 "Unexpected aarch64_neon_u/saddlv type");
6672 (void)OpVT;
6673 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6674 SDValue ADDLV = DAG.getNode(
6675 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6676 : AArch64ISD::SADDLV,
6677 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6678 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6679 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6680 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6681 return EXTRACT_VEC_ELT;
6682 }
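  // As a rough illustration of the case above (a hedged sketch, not taken
  // from the original source): for
  //   %r = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %v)
  // an AArch64ISD::UADDLV node is built with a v4i32 result and lane 0 is
  // then extracted as the scalar, which typically selects to something like
  //   uaddlv h0, v0.8b
  //   fmov   w0, s0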
6683 case Intrinsic::experimental_cttz_elts: {
6684 SDValue CttzOp = Op.getOperand(1);
6685 EVT VT = CttzOp.getValueType();
6686 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6687
6688 if (VT.isFixedLengthVector()) {
6689 // We can use SVE instructions to lower this intrinsic by first creating
6690 // an SVE predicate register mask from the fixed-width vector.
6691 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6692 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6693 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6694 }
6695
6696 SDValue NewCttzElts =
6697 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6698 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6699 }
6700 case Intrinsic::experimental_vector_match: {
6701 return LowerVectorMatch(Op, DAG);
6702 }
6703 }
6704}
6705
6706bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6707 if (VT.getVectorElementType() == MVT::i8 ||
6708 VT.getVectorElementType() == MVT::i16) {
6709 EltTy = MVT::i32;
6710 return true;
6711 }
6712 return false;
6713}
6714
6715bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6716 EVT DataVT) const {
6717 const EVT IndexVT = Extend.getOperand(0).getValueType();
6718 // SVE only supports implicit extension of 32-bit indices.
6719 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6720 return false;
6721
6722 // Indices cannot be smaller than the main data type.
6723 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6724 return false;
6725
6726 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6727 // element container type, which would violate the previous clause.
6728 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6729}
6730
6731bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6732 EVT ExtVT = ExtVal.getValueType();
6733 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6734 return false;
6735
6736 // It may be worth creating extending masked loads if there are multiple
6737 // masked loads using the same predicate. That way we'll end up creating
6738 // extending masked loads that may then get split by the legaliser. This
6739 // results in just one set of predicate unpacks at the start, instead of
6740 // multiple sets of vector unpacks after each load.
6741 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6742 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6743 // Disable extending masked loads for fixed-width for now, since the code
6744 // quality doesn't look great.
6745 if (!ExtVT.isScalableVector())
6746 return false;
6747
6748 unsigned NumExtMaskedLoads = 0;
6749 for (auto *U : Ld->getMask()->users())
6750 if (isa<MaskedLoadSDNode>(U))
6751 NumExtMaskedLoads++;
6752
6753 if (NumExtMaskedLoads <= 1)
6754 return false;
6755 }
6756 }
6757
6758 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
6759 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
6760 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
6761}
6762
6763unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6764 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6765 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6766 AArch64ISD::GLD1_MERGE_ZERO},
6767 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6768 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6769 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6770 AArch64ISD::GLD1_MERGE_ZERO},
6771 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6772 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6773 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6774 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6775 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6776 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6777 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6778 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6779 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6780 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6781 };
6782 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6783 return AddrModes.find(Key)->second;
6784}
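// A small usage sketch for the helper above (illustrative only). The key is
// (IsScaled, IsSigned, NeedsExtend), so for example
//   getGatherVecOpcode(/*IsScaled=*/true, /*IsSigned=*/false,
//                      /*NeedsExtend=*/true)
// returns AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO. Note that the signed and
// unsigned keys map to the same opcode unless the index needs extending.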
6785
6786unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6787 switch (Opcode) {
6788 default:
6789 llvm_unreachable("unimplemented opcode");
6790 return Opcode;
6791 case AArch64ISD::GLD1_MERGE_ZERO:
6792 return AArch64ISD::GLD1S_MERGE_ZERO;
6793 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6794 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6795 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6796 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6797 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6798 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6799 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6800 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6801 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6802 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6803 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6804 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6805 }
6806}
6807
6808SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6809 SelectionDAG &DAG) const {
6810 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6811
6812 SDLoc DL(Op);
6813 SDValue Chain = MGT->getChain();
6814 SDValue PassThru = MGT->getPassThru();
6815 SDValue Mask = MGT->getMask();
6816 SDValue BasePtr = MGT->getBasePtr();
6817 SDValue Index = MGT->getIndex();
6818 SDValue Scale = MGT->getScale();
6819 EVT VT = Op.getValueType();
6820 EVT MemVT = MGT->getMemoryVT();
6821 ISD::LoadExtType ExtType = MGT->getExtensionType();
6822 ISD::MemIndexType IndexType = MGT->getIndexType();
6823
6824  // SVE supports zero (and so undef) passthrough values only; everything else
6825  // must be handled manually by an explicit select on the load's output.
6826 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6827 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6828 SDValue Load =
6829 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6830 MGT->getMemOperand(), IndexType, ExtType);
6831 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6832 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6833 }
6834
6835 bool IsScaled = MGT->isIndexScaled();
6836 bool IsSigned = MGT->isIndexSigned();
6837
6838 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6839  // must be calculated beforehand.
6840 uint64_t ScaleVal = Scale->getAsZExtVal();
6841 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6842 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6843 EVT IndexVT = Index.getValueType();
6844 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6845 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6846 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6847
6848 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6849 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6850 MGT->getMemOperand(), IndexType, ExtType);
6851 }
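  // A worked example of the re-scaling above (illustrative, not from the
  // original source): if MemVT has i32 elements (scalar store size 4) but the
  // gather was created with Scale == 8, the index vector is shifted left by
  // Log2_32(8) == 3 and Scale is reset to 1, so the rewritten gather computes
  // BasePtr + (Index << 3) explicitly instead of relying on hardware scaling.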
6852
6853 // Lower fixed length gather to a scalable equivalent.
6854 if (VT.isFixedLengthVector()) {
6855 assert(Subtarget->useSVEForFixedLengthVectors() &&
6856 "Cannot lower when not using SVE for fixed vectors!");
6857
6858 // NOTE: Handle floating-point as if integer then bitcast the result.
6859 EVT DataVT = VT.changeVectorElementTypeToInteger();
6860 MemVT = MemVT.changeVectorElementTypeToInteger();
6861
6862 // Find the smallest integer fixed length vector we can use for the gather.
6863 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6864 if (DataVT.getVectorElementType() == MVT::i64 ||
6865 Index.getValueType().getVectorElementType() == MVT::i64 ||
6866 Mask.getValueType().getVectorElementType() == MVT::i64)
6867 PromotedVT = VT.changeVectorElementType(MVT::i64);
6868
6869 // Promote vector operands except for passthrough, which we know is either
6870 // undef or zero, and thus best constructed directly.
6871 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6872 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6873 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6874
6875 // A promoted result type forces the need for an extending load.
6876 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6877 ExtType = ISD::EXTLOAD;
6878
6879 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6880
6881 // Convert fixed length vector operands to scalable.
6882 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6883 Index = convertToScalableVector(DAG, ContainerVT, Index);
6884     Mask = convertFixedMaskToScalableVector(Mask, DAG);
6885     PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6886 : DAG.getConstant(0, DL, ContainerVT);
6887
6888 // Emit equivalent scalable vector gather.
6889 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6890 SDValue Load =
6891 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6892 Ops, MGT->getMemOperand(), IndexType, ExtType);
6893
6894 // Extract fixed length data then convert to the required result type.
6895 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6896 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6897 if (VT.isFloatingPoint())
6898 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6899
6900 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6901 }
6902
6903 // Everything else is legal.
6904 return Op;
6905}
6906
6907SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6908 SelectionDAG &DAG) const {
6909 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6910
6911 SDLoc DL(Op);
6912 SDValue Chain = MSC->getChain();
6913 SDValue StoreVal = MSC->getValue();
6914 SDValue Mask = MSC->getMask();
6915 SDValue BasePtr = MSC->getBasePtr();
6916 SDValue Index = MSC->getIndex();
6917 SDValue Scale = MSC->getScale();
6918 EVT VT = StoreVal.getValueType();
6919 EVT MemVT = MSC->getMemoryVT();
6920 ISD::MemIndexType IndexType = MSC->getIndexType();
6921 bool Truncating = MSC->isTruncatingStore();
6922
6923 bool IsScaled = MSC->isIndexScaled();
6924 bool IsSigned = MSC->isIndexSigned();
6925
6926 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6927  // must be calculated beforehand.
6928 uint64_t ScaleVal = Scale->getAsZExtVal();
6929 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6930 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6931 EVT IndexVT = Index.getValueType();
6932 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6933 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6934 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6935
6936 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6937 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6938 MSC->getMemOperand(), IndexType, Truncating);
6939 }
6940
6941 // Lower fixed length scatter to a scalable equivalent.
6942 if (VT.isFixedLengthVector()) {
6943 assert(Subtarget->useSVEForFixedLengthVectors() &&
6944 "Cannot lower when not using SVE for fixed vectors!");
6945
6946 // Once bitcast we treat floating-point scatters as if integer.
6947 if (VT.isFloatingPoint()) {
6948       VT = VT.changeVectorElementTypeToInteger();
6949       MemVT = MemVT.changeVectorElementTypeToInteger();
6950 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6951 }
6952
6953 // Find the smallest integer fixed length vector we can use for the scatter.
6954 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6955 if (VT.getVectorElementType() == MVT::i64 ||
6956 Index.getValueType().getVectorElementType() == MVT::i64 ||
6957 Mask.getValueType().getVectorElementType() == MVT::i64)
6958 PromotedVT = VT.changeVectorElementType(MVT::i64);
6959
6960 // Promote vector operands.
6961 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6962 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6963 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6964 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6965
6966 // A promoted value type forces the need for a truncating store.
6967 if (PromotedVT != VT)
6968 Truncating = true;
6969
6970 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6971
6972 // Convert fixed length vector operands to scalable.
6973 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6974 Index = convertToScalableVector(DAG, ContainerVT, Index);
6975     Mask = convertFixedMaskToScalableVector(Mask, DAG);
6976     StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6977
6978 // Emit equivalent scalable vector scatter.
6979 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6980 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6981 MSC->getMemOperand(), IndexType, Truncating);
6982 }
6983
6984 // Everything else is legal.
6985 return Op;
6986}
6987
6988SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6989 SDLoc DL(Op);
6990 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6991 assert(LoadNode && "Expected custom lowering of a masked load node");
6992 EVT VT = Op->getValueType(0);
6993
6994 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6995 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6996
6997 SDValue PassThru = LoadNode->getPassThru();
6998 SDValue Mask = LoadNode->getMask();
6999
7000 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7001 return Op;
7002
7003   SDValue Load = DAG.getMaskedLoad(
7004       VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7005 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7006 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7007 LoadNode->getExtensionType());
7008
7009 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7010
7011 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7012}
7013
7014// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7015 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
7016                                         EVT VT, EVT MemVT,
7017 SelectionDAG &DAG) {
7018 assert(VT.isVector() && "VT should be a vector type");
7019 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7020
7021 SDValue Value = ST->getValue();
7022
7023   // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
7024   // extracts the word lane which represents the v4i8 subvector. It optimizes
7025   // the store to:
7026 //
7027 // xtn v0.8b, v0.8h
7028 // str s0, [x0]
7029
7030 SDValue Undef = DAG.getUNDEF(MVT::i16);
7031 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7032 {Undef, Undef, Undef, Undef});
7033
7034 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7035 Value, UndefVec);
7036 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7037
7038 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7039 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7040 Trunc, DAG.getConstant(0, DL, MVT::i64));
7041
7042 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7043 ST->getBasePtr(), ST->getMemOperand());
7044}
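// A hedged walk-through of the helper above: for a truncating store of
// <4 x i16> to <4 x i8> the DAG becomes, roughly,
//   t0 = concat_vectors v4i16:%val, v4i16:undef   ; v8i16
//   t1 = truncate t0                              ; v8i8
//   t2 = bitcast t1 to v2i32
//   t3 = extract_vector_elt t2, 0                 ; i32
//   store i32 t3, ptr
// which corresponds to the xtn + str s0 sequence shown in the comment above.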
7045
7046 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
7047   SDLoc DL(Op);
7048 SDValue Src = Op.getOperand(0);
7049 MVT DestVT = Op.getSimpleValueType();
7050 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7051   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op);
7052
7053 unsigned SrcAS = N->getSrcAddressSpace();
7054 unsigned DestAS = N->getDestAddressSpace();
7055 assert(SrcAS != DestAS &&
7056 "addrspacecast must be between different address spaces");
7057 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7058 TLI.getTargetMachine().getPointerSize(DestAS) &&
7059 "addrspacecast must be between different ptr sizes");
7060 (void)TLI;
7061
7062 if (SrcAS == ARM64AS::PTR32_SPTR) {
7063 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7064 DAG.getTargetConstant(0, DL, DestVT));
7065 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7066 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7067 DAG.getTargetConstant(0, DL, DestVT));
7068 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7069 (DestAS == ARM64AS::PTR32_UPTR)) {
7070 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7071 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7072 return Trunc;
7073 } else {
7074 return Src;
7075 }
7076}
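// Illustrative behaviour of the lowering above (a hedged summary, assuming
// PTR32_SPTR/PTR32_UPTR model 32-bit signed/unsigned pointers):
//   - 32-bit signed pointer   -> 64-bit pointer: sign-extend the source.
//   - 32-bit unsigned pointer -> 64-bit pointer: zero-extend the source.
//   - 64-bit pointer -> 32-bit pointer: getAnyExtOrTrunc followed by an
//     in-register zero-extension of the result.
//   - Any other pair of address spaces: the source is returned unchanged.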
7077
7078 // Custom lowering for any store, vector or scalar, truncating or not.
7079 // Currently we only custom lower truncating stores from vector v4i16 to
7080 // v4i8 and volatile stores of i128.
7081SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7082 SelectionDAG &DAG) const {
7083 SDLoc Dl(Op);
7084 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7085 assert (StoreNode && "Can only custom lower store nodes");
7086
7087 SDValue Value = StoreNode->getValue();
7088
7089 EVT VT = Value.getValueType();
7090 EVT MemVT = StoreNode->getMemoryVT();
7091
7092 if (VT.isVector()) {
7093     if (useSVEForFixedLengthVectorVT(
7094             VT,
7095 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7096 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7097
7098 unsigned AS = StoreNode->getAddressSpace();
7099 Align Alignment = StoreNode->getAlign();
7100 if (Alignment < MemVT.getStoreSize() &&
7101 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7102 StoreNode->getMemOperand()->getFlags(),
7103 nullptr)) {
7104 return scalarizeVectorStore(StoreNode, DAG);
7105 }
7106
7107 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7108 MemVT == MVT::v4i8) {
7109 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7110 }
7111 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7112 // the custom lowering, as there are no un-paired non-temporal stores and
7113 // legalization will break up 256 bit inputs.
7114 ElementCount EC = MemVT.getVectorElementCount();
7115 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
7116 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
7117 (MemVT.getScalarSizeInBits() == 8u ||
7118 MemVT.getScalarSizeInBits() == 16u ||
7119 MemVT.getScalarSizeInBits() == 32u ||
7120 MemVT.getScalarSizeInBits() == 64u)) {
7121 SDValue Lo =
7122           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7123                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7124                       StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
7125 SDValue Hi =
7126           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7127                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7128                       StoreNode->getValue(),
7129 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
7130       SDValue Result = DAG.getMemIntrinsicNode(
7131           AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
7132 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7133 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7134 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7135 return Result;
7136 }
7137 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7138 return LowerStore128(Op, DAG);
7139 } else if (MemVT == MVT::i64x8) {
7140 SDValue Value = StoreNode->getValue();
7141 assert(Value->getValueType(0) == MVT::i64x8);
7142 SDValue Chain = StoreNode->getChain();
7143 SDValue Base = StoreNode->getBasePtr();
7144 EVT PtrVT = Base.getValueType();
7145 for (unsigned i = 0; i < 8; i++) {
7146 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
7147 Value, DAG.getConstant(i, Dl, MVT::i32));
7148 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7149 DAG.getConstant(i * 8, Dl, PtrVT));
7150 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7151 StoreNode->getBaseAlign());
7152 }
7153 return Chain;
7154 }
7155
7156 return SDValue();
7157}
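// An illustrative example of the 256-bit non-temporal path above (hedged):
// a non-temporal store of v8i32 is split into two 128-bit halves with
// EXTRACT_SUBVECTOR (elements [0,4) and [4,8)), both halves are bitcast to
// v2i64, and a single AArch64ISD::STNP node is emitted, which typically
// becomes one "stnp q0, q1, [x0]" instruction.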
7158
7159/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7160SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7161 SelectionDAG &DAG) const {
7162 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7163 assert(StoreNode->getMemoryVT() == MVT::i128);
7164 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7165
7166 bool IsStoreRelease =
7167       StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7168   if (StoreNode->isAtomic())
7169 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7170 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7171            StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7172            StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7173
7174 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7175 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7176 ? StoreNode->getOperand(1)
7177 : StoreNode->getOperand(2);
7178 SDLoc DL(Op);
7179 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7180 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7181 if (DAG.getDataLayout().isBigEndian())
7182 std::swap(StoreValue.first, StoreValue.second);
7183   SDValue Result = DAG.getMemIntrinsicNode(
7184       Opcode, DL, DAG.getVTList(MVT::Other),
7185 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7186 StoreNode->getBasePtr()},
7187 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7188 return Result;
7189}
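// Illustrative codegen for the helper above (a hedged sketch): a volatile or
// monotonic atomic i128 store is split into two i64 halves and emitted as
// "stp xLo, xHi, [addr]", while a release atomic store on an RCPC3 target
// selects the STILP form instead, matching the IsStoreRelease check above.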
7190
7191SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7192 SelectionDAG &DAG) const {
7193 SDLoc DL(Op);
7194 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7195 assert(LoadNode && "Expected custom lowering of a load node");
7196
7197 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7198     SmallVector<SDValue, 8> Ops;
7199     SDValue Base = LoadNode->getBasePtr();
7200 SDValue Chain = LoadNode->getChain();
7201 EVT PtrVT = Base.getValueType();
7202 for (unsigned i = 0; i < 8; i++) {
7203 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7204 DAG.getConstant(i * 8, DL, PtrVT));
7205 SDValue Part =
7206 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7207 LoadNode->getBaseAlign());
7208 Ops.push_back(Part);
7209 Chain = SDValue(Part.getNode(), 1);
7210 }
7211 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7212 return DAG.getMergeValues({Loaded, Chain}, DL);
7213 }
7214
7215 // Custom lowering for extending v4i8 vector loads.
7216 EVT VT = Op->getValueType(0);
7217 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7218
7219 if (LoadNode->getMemoryVT() != MVT::v4i8)
7220 return SDValue();
7221
7222 // Avoid generating unaligned loads.
7223 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7224 return SDValue();
7225
7226 unsigned ExtType;
7227 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7228 ExtType = ISD::SIGN_EXTEND;
7229 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7230 LoadNode->getExtensionType() == ISD::EXTLOAD)
7231 ExtType = ISD::ZERO_EXTEND;
7232 else
7233 return SDValue();
7234
7235 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7236 LoadNode->getBasePtr(), MachinePointerInfo());
7237 SDValue Chain = Load.getValue(1);
7238 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7239 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7240 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7241 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7242 DAG.getConstant(0, DL, MVT::i64));
7243 if (VT == MVT::v4i32)
7244 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7245 return DAG.getMergeValues({Ext, Chain}, DL);
7246}
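// A hedged sketch of the v4i8 extending-load path above: the four i8 lanes
// are loaded with a single 32-bit scalar load and then widened in vector
// registers, roughly
//   ldr   s0, [x0]            ; all four bytes loaded into lane 0
//   ushll v0.8h, v0.8b, #0    ; zero-extend (sshll for a sign-extending load)
// with only the low four i16 lanes used, plus a further extend to v4i32 when
// the result type requires it.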
7247
7248SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7249 SelectionDAG &DAG) const {
7250 SDLoc DL(Op);
7251 SDValue Vec = Op.getOperand(0);
7252 SDValue Mask = Op.getOperand(1);
7253 SDValue Passthru = Op.getOperand(2);
7254 EVT VecVT = Vec.getValueType();
7255 EVT MaskVT = Mask.getValueType();
7256 EVT ElmtVT = VecVT.getVectorElementType();
7257 const bool IsFixedLength = VecVT.isFixedLengthVector();
7258 const bool HasPassthru = !Passthru.isUndef();
7259 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7260 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7261
7262 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7263
7264 if (!Subtarget->isSVEAvailable())
7265 return SDValue();
7266
7267 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7268 return SDValue();
7269
7270 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7271 if (MinElmts != 2 && MinElmts != 4)
7272 return SDValue();
7273
7274 // We can use the SVE register containing the NEON vector in its lowest bits.
7275 if (IsFixedLength) {
7276 EVT ScalableVecVT =
7277 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7278 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7279 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7280
7281 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7282 DAG.getUNDEF(ScalableVecVT), Vec,
7283 DAG.getConstant(0, DL, MVT::i64));
7284 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7285 DAG.getUNDEF(ScalableMaskVT), Mask,
7286 DAG.getConstant(0, DL, MVT::i64));
7287     Mask = DAG.getNode(ISD::TRUNCATE, DL,
7288                        ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7289 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7290 DAG.getUNDEF(ScalableVecVT), Passthru,
7291 DAG.getConstant(0, DL, MVT::i64));
7292
7293 VecVT = Vec.getValueType();
7294 MaskVT = Mask.getValueType();
7295 }
7296
7297 // Get legal type for compact instruction
7298 EVT ContainerVT = getSVEContainerType(VecVT);
7299 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7300
7301 // Convert to i32 or i64 for smaller types, as these are the only supported
7302 // sizes for compact.
7303 if (ContainerVT != VecVT) {
7304 Vec = DAG.getBitcast(CastVT, Vec);
7305 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7306 }
7307
7308 SDValue Compressed = DAG.getNode(
7309       ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
7310       DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask,
7311 Vec);
7312
7313 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7314 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7315 SDValue Offset = DAG.getNode(
7316 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7317 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
7318 Mask);
7319
7320 SDValue IndexMask = DAG.getNode(
7321 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7322 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7323 DAG.getConstant(0, DL, MVT::i64), Offset);
7324
7325 Compressed =
7326 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7327 }
7328
7329 // Extracting from a legal SVE type before truncating produces better code.
7330 if (IsFixedLength) {
7331 Compressed = DAG.getNode(
7332         ISD::EXTRACT_SUBVECTOR, DL,
7333         FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7334 Compressed, DAG.getConstant(0, DL, MVT::i64));
7335 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7336 VecVT = FixedVecVT;
7337 }
7338
7339 // If we changed the element type before, we need to convert it back.
7340 if (ContainerVT != VecVT) {
7341 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7342 Compressed = DAG.getBitcast(VecVT, Compressed);
7343 }
7344
7345 return Compressed;
7346}
7347
7348// Generate SUBS and CSEL for integer abs.
7349SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7350 MVT VT = Op.getSimpleValueType();
7351
7352 if (VT.isVector())
7353 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7354
7355 SDLoc DL(Op);
7356 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7357
7358 // Generate SUBS & CSEL.
7359 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7360 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7361 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7362 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7363}
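// An illustrative example for the scalar path above (hedged): for an i32 abs
// the SUBS compares the input with zero and the CSEL keeps the original value
// on PL (non-negative) and the negated value otherwise, which typically
// prints as
//   cmp  w0, #0
//   cneg w0, w0, mi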
7364
7366 SDValue Chain = Op.getOperand(0);
7367 SDValue Cond = Op.getOperand(1);
7368 SDValue Dest = Op.getOperand(2);
7369
7370   AArch64CC::CondCode CC;
7371   if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7372 SDLoc DL(Op);
7373 SDValue CCVal = getCondCode(DAG, CC);
7374 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7375 Cmp);
7376 }
7377
7378 return SDValue();
7379}
7380
7381 // Treat FSHR with constant shifts as a legal operation; otherwise it is
7382 // expanded. FSHL is converted to FSHR before deciding what to do with it.
7383 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7384   SDValue Shifts = Op.getOperand(2);
7385 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7386 // If opcode is FSHL, convert it to FSHR
7387 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7388 SDLoc DL(Op);
7389 MVT VT = Op.getSimpleValueType();
7390 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7391
7392 if (Op.getOpcode() == ISD::FSHL) {
7393 if (NewShiftNo == 0)
7394 return Op.getOperand(0);
7395
7396 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7397 return DAG.getNode(
7398 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7399 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7400 }
7401
7402 if (Op.getOpcode() == ISD::FSHR) {
7403 if (NewShiftNo == 0)
7404 return Op.getOperand(1);
7405
7406 if (ShiftNo->getZExtValue() == NewShiftNo)
7407 return Op;
7408
7409 // Rewrite using the normalised shift amount.
7410 return DAG.getNode(
7411 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7412 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7413 }
7414 }
7415
7416 return SDValue();
7417}
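// A worked example of the normalisation above (illustrative only): a 32-bit
// fshl with a constant shift of 40 first reduces the amount modulo 32 to 8
// and is then rewritten as an fshr by 32 - 8 == 24 (the two forms compute
// the same value), so only FSHR nodes with in-range constant shifts reach
// instruction selection.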
7418
7419 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7420   SDValue X = Op.getOperand(0);
7421 EVT XScalarTy = X.getValueType();
7422 SDValue Exp = Op.getOperand(1);
7423
7424 SDLoc DL(Op);
7425 EVT XVT, ExpVT;
7426 switch (Op.getSimpleValueType().SimpleTy) {
7427 default:
7428 return SDValue();
7429 case MVT::bf16:
7430 case MVT::f16:
7431 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7432 [[fallthrough]];
7433 case MVT::f32:
7434 XVT = MVT::nxv4f32;
7435 ExpVT = MVT::nxv4i32;
7436 break;
7437 case MVT::f64:
7438 XVT = MVT::nxv2f64;
7439 ExpVT = MVT::nxv2i64;
7440 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7441 break;
7442 }
7443
7444 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7445 SDValue VX =
7446 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7447 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7448 DAG.getUNDEF(ExpVT), Exp, Zero);
7449 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7450 AArch64SVEPredPattern::all);
7451 SDValue FScale = DAG.getNode(
7452       ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7453       DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg,
7454 VX, VExp);
7455 SDValue Final =
7456 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7457 if (X.getValueType() != XScalarTy)
7458 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7459 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7460 return Final;
7461}
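// A hedged outline of the FLDEXP lowering above: the scalar x and the
// exponent are inserted into lane 0 of SVE vectors (nxv4f32 or nxv2f64), an
// all-true predicate is built with ptrue, and aarch64_sve_fscale computes
// x * 2^n per lane; lane 0 is then extracted as the scalar result, with
// f16/bf16 inputs taking an f32 round trip.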
7462
7463SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7464 SelectionDAG &DAG) const {
7465 return Op.getOperand(0);
7466}
7467
7468SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7469 SelectionDAG &DAG) const {
7470 SDValue Chain = Op.getOperand(0);
7471 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7472 SDValue FPtr = Op.getOperand(2); // nested function
7473 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7474
7475 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7476
7477 // ldr NestReg, .+16
7478 // ldr x17, .+20
7479 // br x17
7480 // .word 0
7481 // .nest: .qword nest
7482 // .fptr: .qword fptr
7483 SDValue OutChains[5];
7484
7485 const Function *Func =
7486 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7487 CallingConv::ID CC = Func->getCallingConv();
7488 unsigned NestReg;
7489
7490 switch (CC) {
7491 default:
7492 NestReg = 0x0f; // X15
7493 break;
7495 // Must be kept in sync with AArch64CallingConv.td
7496 NestReg = 0x04; // X4
7497 break;
7498 }
7499
7500 const char FptrReg = 0x11; // X17
7501
7502 SDValue Addr = Trmp;
7503
7504 SDLoc DL(Op);
7505 OutChains[0] = DAG.getStore(
7506 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7507 MachinePointerInfo(TrmpAddr));
7508
7509 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7510 DAG.getConstant(4, DL, MVT::i64));
7511 OutChains[1] = DAG.getStore(
7512 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7513 MachinePointerInfo(TrmpAddr, 4));
7514
7515 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7516 DAG.getConstant(8, DL, MVT::i64));
7517 OutChains[2] =
7518 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7519 MachinePointerInfo(TrmpAddr, 8));
7520
7521 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7522 DAG.getConstant(16, DL, MVT::i64));
7523 OutChains[3] =
7524 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7525
7526 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7527 DAG.getConstant(24, DL, MVT::i64));
7528 OutChains[4] =
7529 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7530
7531 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7532
7533 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7534 DAG.getConstant(12, DL, MVT::i64));
7535
7536 // Call clear cache on the trampoline instructions.
7537 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7538 EndOfTrmp);
7539}
7540
7541 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7542                                               SelectionDAG &DAG) const {
7543 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7544 LLVM_DEBUG(Op.dump());
7545
7546 switch (Op.getOpcode()) {
7547 default:
7548 llvm_unreachable("unimplemented operand");
7549 return SDValue();
7550   case ISD::LOOP_DEPENDENCE_WAR_MASK:
7551   case ISD::LOOP_DEPENDENCE_RAW_MASK:
7552     return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7553 case ISD::BITCAST:
7554 return LowerBITCAST(Op, DAG);
7555 case ISD::GlobalAddress:
7556 return LowerGlobalAddress(Op, DAG);
7557   case ISD::GlobalTLSAddress:
7558     return LowerGlobalTLSAddress(Op, DAG);
7559   case ISD::PtrAuthGlobalAddress:
7560     return LowerPtrAuthGlobalAddress(Op, DAG);
7561 case ISD::ADJUST_TRAMPOLINE:
7562 return LowerADJUST_TRAMPOLINE(Op, DAG);
7563 case ISD::INIT_TRAMPOLINE:
7564 return LowerINIT_TRAMPOLINE(Op, DAG);
7565 case ISD::SETCC:
7566 case ISD::STRICT_FSETCC:
7567   case ISD::STRICT_FSETCCS:
7568     return LowerSETCC(Op, DAG);
7569 case ISD::SETCCCARRY:
7570 return LowerSETCCCARRY(Op, DAG);
7571 case ISD::BRCOND:
7572 return LowerBRCOND(Op, DAG);
7573 case ISD::BR_CC:
7574 return LowerBR_CC(Op, DAG);
7575 case ISD::SELECT:
7576 return LowerSELECT(Op, DAG);
7577 case ISD::SELECT_CC:
7578 return LowerSELECT_CC(Op, DAG);
7579 case ISD::JumpTable:
7580 return LowerJumpTable(Op, DAG);
7581 case ISD::BR_JT:
7582 return LowerBR_JT(Op, DAG);
7583 case ISD::BRIND:
7584 return LowerBRIND(Op, DAG);
7585 case ISD::ConstantPool:
7586 return LowerConstantPool(Op, DAG);
7587 case ISD::BlockAddress:
7588 return LowerBlockAddress(Op, DAG);
7589 case ISD::VASTART:
7590 return LowerVASTART(Op, DAG);
7591 case ISD::VACOPY:
7592 return LowerVACOPY(Op, DAG);
7593 case ISD::VAARG:
7594 return LowerVAARG(Op, DAG);
7595 case ISD::UADDO_CARRY:
7596 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7597 case ISD::USUBO_CARRY:
7598 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7599 case ISD::SADDO_CARRY:
7600 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7601 case ISD::SSUBO_CARRY:
7602 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7603 case ISD::SADDO:
7604 case ISD::UADDO:
7605 case ISD::SSUBO:
7606 case ISD::USUBO:
7607 case ISD::SMULO:
7608 case ISD::UMULO:
7609 return LowerXALUO(Op, DAG);
7610 case ISD::FADD:
7611 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7612 case ISD::FSUB:
7613 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7614 case ISD::FMUL:
7615 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7616 case ISD::FMA:
7617 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7618 case ISD::FDIV:
7619 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7620 case ISD::FNEG:
7621 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7622 case ISD::FCEIL:
7623 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7624 case ISD::FFLOOR:
7625 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7626 case ISD::FNEARBYINT:
7627 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7628 case ISD::FRINT:
7629 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7630 case ISD::FROUND:
7631 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7632 case ISD::FROUNDEVEN:
7633 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7634 case ISD::FTRUNC:
7635 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7636 case ISD::FSQRT:
7637 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7638 case ISD::FABS:
7639 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7640 case ISD::FP_ROUND:
7641   case ISD::STRICT_FP_ROUND:
7642     return LowerFP_ROUND(Op, DAG);
7643 case ISD::FP_EXTEND:
7644   case ISD::STRICT_FP_EXTEND:
7645     return LowerFP_EXTEND(Op, DAG);
7646 case ISD::FRAMEADDR:
7647 return LowerFRAMEADDR(Op, DAG);
7648 case ISD::SPONENTRY:
7649 return LowerSPONENTRY(Op, DAG);
7650 case ISD::RETURNADDR:
7651 return LowerRETURNADDR(Op, DAG);
7652   case ISD::ADDROFRETURNADDR:
7653     return LowerADDROFRETURNADDR(Op, DAG);
7654   case ISD::CONCAT_VECTORS:
7655     return LowerCONCAT_VECTORS(Op, DAG);
7656   case ISD::INSERT_VECTOR_ELT:
7657     return LowerINSERT_VECTOR_ELT(Op, DAG);
7658   case ISD::EXTRACT_VECTOR_ELT:
7659     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7660 case ISD::BUILD_VECTOR:
7661 return LowerBUILD_VECTOR(Op, DAG);
7662   case ISD::ZERO_EXTEND_VECTOR_INREG:
7663     return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7664   case ISD::VECTOR_SHUFFLE:
7665     return LowerVECTOR_SHUFFLE(Op, DAG);
7666 case ISD::SPLAT_VECTOR:
7667 return LowerSPLAT_VECTOR(Op, DAG);
7668   case ISD::EXTRACT_SUBVECTOR:
7669     return LowerEXTRACT_SUBVECTOR(Op, DAG);
7670   case ISD::INSERT_SUBVECTOR:
7671     return LowerINSERT_SUBVECTOR(Op, DAG);
7672 case ISD::SDIV:
7673 case ISD::UDIV:
7674 return LowerDIV(Op, DAG);
7675 case ISD::SMIN:
7676 case ISD::UMIN:
7677 case ISD::SMAX:
7678 case ISD::UMAX:
7679 return LowerMinMax(Op, DAG);
7680 case ISD::SRA:
7681 case ISD::SRL:
7682 case ISD::SHL:
7683 return LowerVectorSRA_SRL_SHL(Op, DAG);
7684 case ISD::SHL_PARTS:
7685 case ISD::SRL_PARTS:
7686 case ISD::SRA_PARTS:
7687 return LowerShiftParts(Op, DAG);
7688 case ISD::CTPOP:
7689 case ISD::PARITY:
7690 return LowerCTPOP_PARITY(Op, DAG);
7691 case ISD::FCOPYSIGN:
7692 return LowerFCOPYSIGN(Op, DAG);
7693 case ISD::OR:
7694 return LowerVectorOR(Op, DAG);
7695 case ISD::XOR:
7696 return LowerXOR(Op, DAG);
7697 case ISD::PREFETCH:
7698 return LowerPREFETCH(Op, DAG);
7699 case ISD::SINT_TO_FP:
7700 case ISD::UINT_TO_FP:
7701   case ISD::STRICT_SINT_TO_FP:
7702   case ISD::STRICT_UINT_TO_FP:
7703     return LowerINT_TO_FP(Op, DAG);
7704 case ISD::FP_TO_SINT:
7705 case ISD::FP_TO_UINT:
7706   case ISD::STRICT_FP_TO_SINT:
7707   case ISD::STRICT_FP_TO_UINT:
7708     return LowerFP_TO_INT(Op, DAG);
7709   case ISD::FP_TO_SINT_SAT:
7710   case ISD::FP_TO_UINT_SAT:
7711     return LowerFP_TO_INT_SAT(Op, DAG);
7712 case ISD::GET_ROUNDING:
7713 return LowerGET_ROUNDING(Op, DAG);
7714 case ISD::SET_ROUNDING:
7715 return LowerSET_ROUNDING(Op, DAG);
7716 case ISD::GET_FPMODE:
7717 return LowerGET_FPMODE(Op, DAG);
7718 case ISD::SET_FPMODE:
7719 return LowerSET_FPMODE(Op, DAG);
7720 case ISD::RESET_FPMODE:
7721 return LowerRESET_FPMODE(Op, DAG);
7722 case ISD::MUL:
7723 return LowerMUL(Op, DAG);
7724 case ISD::MULHS:
7725 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7726 case ISD::MULHU:
7727 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7728   case ISD::INTRINSIC_W_CHAIN:
7729     return LowerINTRINSIC_W_CHAIN(Op, DAG);
7730   case ISD::INTRINSIC_WO_CHAIN:
7731     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7732   case ISD::INTRINSIC_VOID:
7733     return LowerINTRINSIC_VOID(Op, DAG);
7734 case ISD::ATOMIC_STORE:
7735 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7736 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7737 return LowerStore128(Op, DAG);
7738 }
7739 return SDValue();
7740 case ISD::STORE:
7741 return LowerSTORE(Op, DAG);
7742 case ISD::MSTORE:
7743 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7744 case ISD::MGATHER:
7745 return LowerMGATHER(Op, DAG);
7746 case ISD::MSCATTER:
7747 return LowerMSCATTER(Op, DAG);
7748 case ISD::VECREDUCE_SEQ_FADD:
7749 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7750 case ISD::VECREDUCE_ADD:
7751 case ISD::VECREDUCE_AND:
7752 case ISD::VECREDUCE_OR:
7753 case ISD::VECREDUCE_XOR:
7754 case ISD::VECREDUCE_SMAX:
7755 case ISD::VECREDUCE_SMIN:
7756 case ISD::VECREDUCE_UMAX:
7757 case ISD::VECREDUCE_UMIN:
7758 case ISD::VECREDUCE_FADD:
7759 case ISD::VECREDUCE_FMAX:
7760 case ISD::VECREDUCE_FMIN:
7761 case ISD::VECREDUCE_FMAXIMUM:
7762 case ISD::VECREDUCE_FMINIMUM:
7763 return LowerVECREDUCE(Op, DAG);
7764 case ISD::VECREDUCE_MUL:
7765 case ISD::VECREDUCE_FMUL:
7766 return LowerVECREDUCE_MUL(Op, DAG);
7767 case ISD::ATOMIC_LOAD_AND:
7768 return LowerATOMIC_LOAD_AND(Op, DAG);
7769 case ISD::DYNAMIC_STACKALLOC:
7770 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7771 case ISD::VSCALE:
7772 return LowerVSCALE(Op, DAG);
7773   case ISD::VECTOR_COMPRESS:
7774     return LowerVECTOR_COMPRESS(Op, DAG);
7775 case ISD::ANY_EXTEND:
7776 case ISD::SIGN_EXTEND:
7777 case ISD::ZERO_EXTEND:
7778 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7779 case ISD::ADDRSPACECAST:
7780 return LowerADDRSPACECAST(Op, DAG);
7781   case ISD::SIGN_EXTEND_INREG: {
7782     // Only custom lower when ExtraVT has a legal byte-based element type.
7783 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7784 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7785 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7786 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7787 return SDValue();
7788
7789 return LowerToPredicatedOp(Op, DAG,
7790 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7791 }
7792 case ISD::TRUNCATE:
7793 return LowerTRUNCATE(Op, DAG);
7794 case ISD::MLOAD:
7795 return LowerMLOAD(Op, DAG);
7796 case ISD::LOAD:
7797 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7798 !Subtarget->isNeonAvailable()))
7799 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7800 return LowerLOAD(Op, DAG);
7801 case ISD::ADD:
7802 case ISD::AND:
7803 case ISD::SUB:
7804 return LowerToScalableOp(Op, DAG);
7805 case ISD::FMAXIMUM:
7806 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7807 case ISD::FMAXNUM:
7808 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7809 case ISD::FMINIMUM:
7810 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7811 case ISD::FMINNUM:
7812 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7813 case ISD::VSELECT:
7814 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7815 case ISD::ABS:
7816 return LowerABS(Op, DAG);
7817 case ISD::ABDS:
7818 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7819 case ISD::ABDU:
7820 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7821 case ISD::AVGFLOORS:
7822 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7823 case ISD::AVGFLOORU:
7824 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7825 case ISD::AVGCEILS:
7826 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7827 case ISD::AVGCEILU:
7828 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7829 case ISD::BITREVERSE:
7830 return LowerBitreverse(Op, DAG);
7831 case ISD::BSWAP:
7832 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7833 case ISD::CTLZ:
7834 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7835 case ISD::CTTZ:
7836 return LowerCTTZ(Op, DAG);
7837 case ISD::VECTOR_SPLICE:
7838 return LowerVECTOR_SPLICE(Op, DAG);
7839   case ISD::VECTOR_DEINTERLEAVE:
7840     return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7841   case ISD::VECTOR_INTERLEAVE:
7842     return LowerVECTOR_INTERLEAVE(Op, DAG);
7843 case ISD::GET_ACTIVE_LANE_MASK:
7844 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7845 case ISD::LRINT:
7846 case ISD::LLRINT:
7847 if (Op.getValueType().isVector())
7848 return LowerVectorXRINT(Op, DAG);
7849 [[fallthrough]];
7850 case ISD::LROUND:
7851 case ISD::LLROUND: {
7852 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7853 Op.getOperand(0).getValueType() == MVT::bf16) &&
7854 "Expected custom lowering of rounding operations only for f16");
7855 SDLoc DL(Op);
7856 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7857 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7858 }
7859 case ISD::STRICT_LROUND:
7860   case ISD::STRICT_LLROUND:
7861   case ISD::STRICT_LRINT:
7862 case ISD::STRICT_LLRINT: {
7863 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7864 Op.getOperand(1).getValueType() == MVT::bf16) &&
7865 "Expected custom lowering of rounding operations only for f16");
7866 SDLoc DL(Op);
7867 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7868 {Op.getOperand(0), Op.getOperand(1)});
7869 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7870 {Ext.getValue(1), Ext.getValue(0)});
7871 }
7872 case ISD::WRITE_REGISTER: {
7873 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7874 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7875 SDLoc DL(Op);
7876
7877 SDValue Chain = Op.getOperand(0);
7878 SDValue SysRegName = Op.getOperand(1);
7879 std::pair<SDValue, SDValue> Pair =
7880 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7881
7882 // chain = MSRR(chain, sysregname, lo, hi)
7883 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7884 SysRegName, Pair.first, Pair.second);
7885
7886 return Result;
7887 }
7888 case ISD::FSHL:
7889 case ISD::FSHR:
7890 return LowerFunnelShift(Op, DAG);
7891 case ISD::FLDEXP:
7892 return LowerFLDEXP(Op, DAG);
7893 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7894 return LowerVECTOR_HISTOGRAM(Op, DAG);
7895 case ISD::PARTIAL_REDUCE_SMLA:
7896 case ISD::PARTIAL_REDUCE_UMLA:
7897 case ISD::PARTIAL_REDUCE_SUMLA:
7898 case ISD::PARTIAL_REDUCE_FMLA:
7899 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7900 }
7901}
7902
7903 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7904   return !Subtarget->useSVEForFixedLengthVectors();
7905}
7906
7907 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7908     EVT VT, bool OverrideNEON) const {
7909 if (!VT.isFixedLengthVector() || !VT.isSimple())
7910 return false;
7911
7912 // Don't use SVE for vectors we cannot scalarize if required.
7913 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7914 // Fixed length predicates should be promoted to i8.
7915 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7916 case MVT::i1:
7917 default:
7918 return false;
7919 case MVT::i8:
7920 case MVT::i16:
7921 case MVT::i32:
7922 case MVT::i64:
7923 case MVT::f16:
7924 case MVT::f32:
7925 case MVT::f64:
7926 break;
7927 }
7928
7929 // NEON-sized vectors can be emulated using SVE instructions.
7930 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7931 return Subtarget->isSVEorStreamingSVEAvailable();
7932
7933 // Ensure NEON MVTs only belong to a single register class.
7934 if (VT.getFixedSizeInBits() <= 128)
7935 return false;
7936
7937 // Ensure wider than NEON code generation is enabled.
7938 if (!Subtarget->useSVEForFixedLengthVectors())
7939 return false;
7940
7941 // Don't use SVE for types that don't fit.
7942 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7943 return false;
7944
7945 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7946 // the base fixed length SVE support in place.
7947 if (!VT.isPow2VectorType())
7948 return false;
7949
7950 return true;
7951}
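// Illustrative outcomes of the predicate above (hedged, assuming a target
// configured with a larger-than-NEON minimum SVE vector length, e.g. via
// -aarch64-sve-vector-bits-min): a 256-bit v8i32 returns true because it is
// wider than NEON, fits the minimum SVE width and has a power-of-two element
// count; a 128-bit v4i32 returns false unless OverrideNEON is set; and any
// i1-element vector returns false.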
7952
7953//===----------------------------------------------------------------------===//
7954// Calling Convention Implementation
7955//===----------------------------------------------------------------------===//
7956
7957static unsigned getIntrinsicID(const SDNode *N) {
7958 unsigned Opcode = N->getOpcode();
7959 switch (Opcode) {
7960 default:
7961     return Intrinsic::not_intrinsic;
7962   case ISD::INTRINSIC_WO_CHAIN: {
7963     unsigned IID = N->getConstantOperandVal(0);
7964 if (IID < Intrinsic::num_intrinsics)
7965 return IID;
7966     return Intrinsic::not_intrinsic;
7967   }
7968 }
7969}
7970
7971 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7972                                                 SDValue N1) const {
7973 if (!N0.hasOneUse())
7974 return false;
7975
7976 unsigned IID = getIntrinsicID(N1.getNode());
7977 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7978 if (IID == Intrinsic::aarch64_neon_umull ||
7979 N1.getOpcode() == AArch64ISD::UMULL ||
7980 IID == Intrinsic::aarch64_neon_smull ||
7981 N1.getOpcode() == AArch64ISD::SMULL)
7982 return N0.getOpcode() != ISD::ADD;
7983
7984 return true;
7985}
7986
7987/// Selects the correct CCAssignFn for a given CallingConvention value.
7988 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7989                                                      bool IsVarArg) const {
7990 switch (CC) {
7991 default:
7992 reportFatalUsageError("unsupported calling convention");
7993 case CallingConv::GHC:
7994 return CC_AArch64_GHC;
7995   case CallingConv::PreserveNone:
7996     // The VarArg implementation makes assumptions about register
7997 // argument passing that do not hold for preserve_none, so we
7998 // instead fall back to C argument passing.
7999 // The non-vararg case is handled in the CC function itself.
8000 if (!IsVarArg)
8001       return CC_AArch64_Preserve_None;
8002     [[fallthrough]];
8003 case CallingConv::C:
8004 case CallingConv::Fast:
8005   case CallingConv::PreserveMost:
8006   case CallingConv::PreserveAll:
8007   case CallingConv::CXX_FAST_TLS:
8008   case CallingConv::Swift:
8009   case CallingConv::SwiftTail:
8010   case CallingConv::Tail:
8011 case CallingConv::GRAAL:
8012 if (Subtarget->isTargetWindows()) {
8013 if (IsVarArg) {
8014 if (Subtarget->isWindowsArm64EC())
8015           return CC_AArch64_Arm64EC_VarArg;
8016         return CC_AArch64_Win64_VarArg;
8017       }
8018 return CC_AArch64_Win64PCS;
8019 }
8020 if (!Subtarget->isTargetDarwin())
8021 return CC_AArch64_AAPCS;
8022 if (!IsVarArg)
8023 return CC_AArch64_DarwinPCS;
8024 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8025                                       : CC_AArch64_DarwinPCS_VarArg;
8026   case CallingConv::Win64:
8027 if (IsVarArg) {
8028 if (Subtarget->isWindowsArm64EC())
8029         return CC_AArch64_Arm64EC_VarArg;
8030       return CC_AArch64_Win64_VarArg;
8031     }
8032 return CC_AArch64_Win64PCS;
8034 if (Subtarget->isWindowsArm64EC())
8042 return CC_AArch64_AAPCS;
8047 }
8048}
8049
8050CCAssignFn *
8051 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
8052   switch (CC) {
8053 default:
8054 return RetCC_AArch64_AAPCS;
8058 if (Subtarget->isWindowsArm64EC())
8060 return RetCC_AArch64_AAPCS;
8061 }
8062}
8063
8064static bool isPassedInFPR(EVT VT) {
8065 return VT.isFixedLengthVector() ||
8066 (VT.isFloatingPoint() && !VT.isScalableVector());
8067}
8068
8069 static SDValue getZT0FrameIndex(MachineFrameInfo &MFI,
8070                                 AArch64FunctionInfo &FuncInfo,
8071 SelectionDAG &DAG) {
8072 if (!FuncInfo.hasZT0SpillSlotIndex())
8073 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8074
8075 return DAG.getFrameIndex(
8076 FuncInfo.getZT0SpillSlotIndex(),
8078}
8079
8080// Emit a call to __arm_sme_save or __arm_sme_restore.
8082 SelectionDAG &DAG,
8084 SDValue Chain, bool IsSave) {
8087 FuncInfo->setSMESaveBufferUsed();
8089 Args.emplace_back(
8090 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
8092
8093 RTLIB::Libcall LC =
8094 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
8095 SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
8096 TLI.getPointerTy(DAG.getDataLayout()));
8097 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8098   TargetLowering::CallLoweringInfo CLI(DAG);
8099   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8100 TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
8101 return TLI.LowerCallTo(CLI).second;
8102}
8103
8104 static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL,
8105                                      const AArch64TargetLowering &TLI,
8106 const AArch64RegisterInfo &TRI,
8107 AArch64FunctionInfo &FuncInfo,
8108 SelectionDAG &DAG) {
8109 // Conditionally restore the lazy save using a pseudo node.
8110 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
8111 TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
8112 SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
8114 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8115 TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout()));
8116 SDValue TPIDR2_EL0 = DAG.getNode(
8117 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
8118 DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8119 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8120 // RESTORE_ZA pseudo.
8121 SDValue Glue;
8122 SDValue TPIDR2Block = DAG.getFrameIndex(
8123 TPIDR2.FrameIndex,
8125 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
8126 Chain =
8127 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8128 {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8129 RestoreRoutine, RegMask, Chain.getValue(1)});
8130 // Finally reset the TPIDR2_EL0 register to 0.
8131 Chain = DAG.getNode(
8132 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8133 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8134 DAG.getConstant(0, DL, MVT::i64));
8135 TPIDR2.Uses++;
8136 return Chain;
8137}
8138
8139SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8140 SelectionDAG &DAG) const {
8141 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8142 SDValue Glue = Chain.getValue(1);
8143
8144 MachineFunction &MF = DAG.getMachineFunction();
8145 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8146 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
8147 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
8148
8149 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8150
8151 // The following conditions are true on entry to an exception handler:
8152 // - PSTATE.SM is 0.
8153 // - PSTATE.ZA is 0.
8154 // - TPIDR2_EL0 is null.
8155 // See:
8156 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8157 //
8158 // Therefore, if the function that contains this exception handler is a
8159 // streaming[-compatible] function, we must re-enable streaming mode.
8160 //
8161 // These mode changes are usually optimized away in catch blocks as they
8162 // occur before the __cxa_begin_catch (which is a non-streaming function),
8163 // but are necessary in some cases (such as for cleanups).
8164 //
8165 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8166
8167 // [COND_]SMSTART SM
8168 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8169 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8170 /*Glue*/ Glue, AArch64SME::Always);
8171 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8172 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8174
8175 if (getTM().useNewSMEABILowering())
8176 return Chain;
8177
8178 if (SMEFnAttrs.hasAgnosticZAInterface()) {
8179 // Restore full ZA
8180 Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
8181 /*IsSave=*/false);
8182 } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
8183 // SMSTART ZA
8184 Chain = DAG.getNode(
8185 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
8186 DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
8187
8188 // Restore ZT0
8189 if (SMEFnAttrs.hasZT0State()) {
8190 SDValue ZT0FrameIndex =
8191 getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
8192 Chain =
8193 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8194 {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
8195 }
8196
8197 // Restore ZA
8198 if (SMEFnAttrs.hasZAState())
8199 Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
8200 }
8201
8202 return Chain;
8203}
8204
8205SDValue AArch64TargetLowering::LowerFormalArguments(
8206 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8207 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8208 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8209 MachineFunction &MF = DAG.getMachineFunction();
8210 const Function &F = MF.getFunction();
8211 MachineFrameInfo &MFI = MF.getFrameInfo();
8212 bool IsWin64 =
8213 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8214 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8215 (isVarArg && Subtarget->isWindowsArm64EC());
8216 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8217
8219 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8221 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8222 FuncInfo->setIsSVECC(true);
8223
8224 // Assign locations to all of the incoming arguments.
8225   SmallVector<CCValAssign, 16> ArgLocs;
8226   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8227
8228 // At this point, Ins[].VT may already be promoted to i32. To correctly
8229 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8230 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8231 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8232 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8233 // LocVT.
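// For example (illustrative): for an IR argument of type i8, Ins[i].VT may
// already have been promoted to i32, but the original type recovered below
// is i8, so the assignment runs with ValVT = LocVT = i8 and the argument is
// later loaded/stored as a genuine byte rather than a full i32 slot.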
8234 unsigned NumArgs = Ins.size();
8235 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8236 unsigned CurArgIdx = 0;
8237 bool UseVarArgCC = false;
8238 if (IsWin64)
8239 UseVarArgCC = isVarArg;
8240
8241 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8242
8243 for (unsigned i = 0; i != NumArgs; ++i) {
8244 MVT ValVT = Ins[i].VT;
8245 if (Ins[i].isOrigArg()) {
8246 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8247 CurArgIdx = Ins[i].getOrigArgIndex();
8248
8249 // Get type of the original argument.
8250 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8251 /*AllowUnknown*/ true);
8252 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8253 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8254 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8255 ValVT = MVT::i8;
8256 else if (ActualMVT == MVT::i16)
8257 ValVT = MVT::i16;
8258 }
8259 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8260 Ins[i].OrigTy, CCInfo);
8261 assert(!Res && "Call operand has unhandled type");
8262 (void)Res;
8263 }
8264
8265 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8266 bool IsLocallyStreaming =
8267 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8268 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8269 SDValue Glue = Chain.getValue(1);
8270
8271 unsigned ExtraArgLocs = 0;
8272 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8273 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8274
8275 if (Ins[i].Flags.isByVal()) {
8276 // Byval is used for HFAs in the PCS, but the system should work in a
8277 // non-compliant manner for larger structs.
8278 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8279 int Size = Ins[i].Flags.getByValSize();
8280 unsigned NumRegs = (Size + 7) / 8;
8281
8282 // FIXME: This works on big-endian for composite byvals, which are the common
8283 // case. It should also work for fundamental types.
8284 unsigned FrameIdx =
8285 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8286 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8287 InVals.push_back(FrameIdxN);
8288
8289 continue;
8290 }
8291
8292 if (Ins[i].Flags.isSwiftAsync())
8293 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8294
8295 SDValue ArgValue;
8296 if (VA.isRegLoc()) {
8297 // Arguments stored in registers.
8298 EVT RegVT = VA.getLocVT();
8299 const TargetRegisterClass *RC;
8300
8301 if (RegVT == MVT::i32)
8302 RC = &AArch64::GPR32RegClass;
8303 else if (RegVT == MVT::i64)
8304 RC = &AArch64::GPR64RegClass;
8305 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8306 RC = &AArch64::FPR16RegClass;
8307 else if (RegVT == MVT::f32)
8308 RC = &AArch64::FPR32RegClass;
8309 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8310 RC = &AArch64::FPR64RegClass;
8311 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8312 RC = &AArch64::FPR128RegClass;
8313 else if (RegVT.isScalableVector() &&
8314 RegVT.getVectorElementType() == MVT::i1) {
8315 FuncInfo->setIsSVECC(true);
8316 RC = &AArch64::PPRRegClass;
8317 } else if (RegVT == MVT::aarch64svcount) {
8318 FuncInfo->setIsSVECC(true);
8319 RC = &AArch64::PPRRegClass;
8320 } else if (RegVT.isScalableVector()) {
8321 FuncInfo->setIsSVECC(true);
8322 RC = &AArch64::ZPRRegClass;
8323 } else
8324 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8325
8326 // Transform the arguments in physical registers into virtual ones.
8327 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8328
8329 if (IsLocallyStreaming) {
8330 // LocallyStreamingFunctions must insert the SMSTART in the correct
8331 // position, so we use Glue to ensure no instructions can be scheduled
8332 // between the chain of:
8333 // t0: ch,glue = EntryNode
8334 // t1: res,ch,glue = CopyFromReg
8335 // ...
8336 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8337 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8338 // ^^^^^^
8339 // This will be the new Chain/Root node.
8340 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8341 Glue = ArgValue.getValue(2);
8342 if (isPassedInFPR(ArgValue.getValueType())) {
8343 ArgValue =
8344 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8345 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8346 {ArgValue, Glue});
8347 Glue = ArgValue.getValue(1);
8348 }
8349 } else
8350 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8351
8352 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8353 // to 64 bits. Insert an assert[sz]ext to capture this, then
8354 // truncate to the right size.
8355 switch (VA.getLocInfo()) {
8356 default:
8357 llvm_unreachable("Unknown loc info!");
8358 case CCValAssign::Full:
8359 break;
8360 case CCValAssign::Indirect:
8361 assert(
8362 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8363 "Indirect arguments should be scalable on most subtargets");
8364 break;
8365 case CCValAssign::BCvt:
8366 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8367 break;
8368 case CCValAssign::AExt:
8369 case CCValAssign::SExt:
8370 case CCValAssign::ZExt:
8371 break;
8372 case CCValAssign::AExtUpper:
8373 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8374 DAG.getConstant(32, DL, RegVT));
8375 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8376 break;
8377 }
8378 } else { // VA.isRegLoc()
8379 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8380 unsigned ArgOffset = VA.getLocMemOffset();
8381 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8382 ? VA.getLocVT().getSizeInBits()
8383 : VA.getValVT().getSizeInBits()) / 8;
8384
8385 uint32_t BEAlign = 0;
8386 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8387 !Ins[i].Flags.isInConsecutiveRegs())
8388 BEAlign = 8 - ArgSize;
8389
8390 SDValue FIN;
8391 MachinePointerInfo PtrInfo;
8392 if (StackViaX4) {
8393 // In both the ARM64EC varargs convention and the thunk convention,
8394 // arguments on the stack are accessed relative to x4, not sp. In
8395 // the thunk convention, there's an additional offset of 32 bytes
8396 // to account for the shadow store.
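// Illustrative arithmetic (not from the source): for a stack argument at
// ArgOffset 8 reached through an ARM64EC_Thunk_X64 thunk, ObjOffset becomes
// 8 + 32 = 40 and the load address below is computed as x4 + 40, i.e. past
// the 32-byte shadow store reserved by the x64-side caller.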
8397 unsigned ObjOffset = ArgOffset + BEAlign;
8398 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8399 ObjOffset += 32;
8400 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8401 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8402 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8403 DAG.getConstant(ObjOffset, DL, MVT::i64));
8404 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8405 } else {
8406 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8407
8408 // Create load nodes to retrieve arguments from the stack.
8409 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8410 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8411 }
8412
8413 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
8414 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8415 MVT MemVT = VA.getValVT();
8416
8417 switch (VA.getLocInfo()) {
8418 default:
8419 break;
8420 case CCValAssign::Trunc:
8421 case CCValAssign::BCvt:
8422 MemVT = VA.getLocVT();
8423 break;
8424 case CCValAssign::Indirect:
8425 assert((VA.getValVT().isScalableVT() ||
8426 Subtarget->isWindowsArm64EC()) &&
8427 "Indirect arguments should be scalable on most subtargets");
8428 MemVT = VA.getLocVT();
8429 break;
8430 case CCValAssign::SExt:
8431 ExtType = ISD::SEXTLOAD;
8432 break;
8433 case CCValAssign::ZExt:
8434 ExtType = ISD::ZEXTLOAD;
8435 break;
8436 case CCValAssign::AExt:
8437 ExtType = ISD::EXTLOAD;
8438 break;
8439 }
8440
8441 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8442 MemVT);
8443 }
8444
8445 if (VA.getLocInfo() == CCValAssign::Indirect) {
8446 assert((VA.getValVT().isScalableVT() ||
8447 Subtarget->isWindowsArm64EC()) &&
8448 "Indirect arguments should be scalable on most subtargets");
8449
8450 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8451 unsigned NumParts = 1;
8452 if (Ins[i].Flags.isInConsecutiveRegs()) {
8453 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8454 ++NumParts;
8455 }
8456
8457 MVT PartLoad = VA.getValVT();
8458 SDValue Ptr = ArgValue;
8459
8460 // Ensure we generate all loads for each tuple part, whilst updating the
8461 // pointer after each load correctly using vscale.
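// For example (illustrative): an SVE tuple such as svint32x2_t that had to
// be passed indirectly (e.g. once the Z argument registers are exhausted)
// is reloaded here as two scalable parts, with the pointer advanced by
// vscale * 16 bytes (one SVE vector's worth) between the two loads.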
8462 while (NumParts > 0) {
8463 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8464 InVals.push_back(ArgValue);
8465 NumParts--;
8466 if (NumParts > 0) {
8467 SDValue BytesIncrement;
8468 if (PartLoad.isScalableVector()) {
8469 BytesIncrement = DAG.getVScale(
8470 DL, Ptr.getValueType(),
8471 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8472 } else {
8473 BytesIncrement = DAG.getConstant(
8474 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8475 Ptr.getValueType());
8476 }
8477 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8478 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8479 ExtraArgLocs++;
8480 i++;
8481 }
8482 }
8483 } else {
8484 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8485 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8486 ArgValue, DAG.getValueType(MVT::i32));
8487
8488 // i1 arguments are zero-extended to i8 by the caller. Emit a
8489 // hint to reflect this.
8490 if (Ins[i].isOrigArg()) {
8491 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8492 if (OrigArg->getType()->isIntegerTy(1)) {
8493 if (!Ins[i].Flags.isZExt()) {
8494 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8495 ArgValue.getValueType(), ArgValue);
8496 }
8497 }
8498 }
8499
8500 InVals.push_back(ArgValue);
8501 }
8502 }
8503 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8504
8505 if (Attrs.hasStreamingCompatibleInterface()) {
8506 SDValue EntryPStateSM =
8507 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8508 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8509
8510 // Copy the value to a virtual register, and save that in FuncInfo.
8511 Register EntryPStateSMReg =
8512 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8513 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8514 EntryPStateSM);
8515 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8516 }
8517
8518 // Insert the SMSTART if this is a locally streaming function and
8519 // make sure it is Glued to the last CopyFromReg value.
8520 if (IsLocallyStreaming) {
8521 if (Attrs.hasStreamingCompatibleInterface())
8522 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8523 AArch64SME::IfCallerIsNonStreaming);
8524 else
8525 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8526 AArch64SME::Always);
8527
8528 // Ensure that the SMSTART happens after the CopyWithChain such that its
8529 // chain result is used.
8530 for (unsigned I=0; I<InVals.size(); ++I) {
8531 Register Reg = MF.getRegInfo().createVirtualRegister(
8532 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8533 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8534 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8535 InVals[I].getValueType());
8536 }
8537 }
8538
8539 // varargs
8540 if (isVarArg) {
8542 if (!Subtarget->isTargetDarwin() || IsWin64) {
8543 // The AAPCS variadic function ABI is identical to the non-variadic
8544 // one. As a result there may be more arguments in registers and we
8545 // should save them for future reference.
8546 // Win64 variadic functions also pass arguments in registers, but all
8547 // float arguments are passed in integer registers.
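// For example (illustrative): a Win64 variadic callee receives a double
// argument in an x-register rather than a d-register, so the GPR save
// area written below already captures it; this is also why no FPR save
// area is created for Win64 further down.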
8548 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8549 }
8550
8551 // This will point to the next argument passed via stack.
8552 unsigned VarArgsOffset = CCInfo.getStackSize();
8553 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8554 VarArgsOffset =
8555 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8556 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8557 FuncInfo->setVarArgsStackIndex(
8558 MFI.CreateFixedObject(4, VarArgsOffset, true));
8559 }
8560
8561 if (MFI.hasMustTailInVarArgFunc()) {
8562 SmallVector<MVT, 2> RegParmTypes;
8563 RegParmTypes.push_back(MVT::i64);
8564 RegParmTypes.push_back(MVT::f128);
8565 // Compute the set of forwarded registers. The rest are scratch.
8566 SmallVectorImpl<ForwardedRegister> &Forwards =
8567 FuncInfo->getForwardedMustTailRegParms();
8568 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8569 CC_AArch64_AAPCS);
8570
8571 // Conservatively forward X8, since it might be used for aggregate return.
8572 if (!CCInfo.isAllocated(AArch64::X8)) {
8573 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8574 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8575 }
8576 }
8577 }
8578
8579 // On Windows, InReg pointers must be returned, so record the pointer in a
8580 // virtual register at the start of the function so it can be returned in the
8581 // epilogue.
8582 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8583 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8584 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8585 Ins[I].Flags.isInReg()) &&
8586 Ins[I].Flags.isSRet()) {
8587 assert(!FuncInfo->getSRetReturnReg());
8588
8589 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8590 Register Reg =
8591 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8592 FuncInfo->setSRetReturnReg(Reg);
8593
8594 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8595 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8596 break;
8597 }
8598 }
8599 }
8600
8601 unsigned StackArgSize = CCInfo.getStackSize();
8602 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8603 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8604 // This is a non-standard ABI so by fiat I say we're allowed to make full
8605 // use of the stack area to be popped, which must be aligned to 16 bytes in
8606 // any case:
8607 StackArgSize = alignTo(StackArgSize, 16);
8608
8609 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8610 // a multiple of 16.
8611 FuncInfo->setArgumentStackToRestore(StackArgSize);
8612
8613 // This realignment carries over to the available bytes below. Our own
8614 // callers will guarantee the space is free by giving an aligned value to
8615 // CALLSEQ_START.
8616 }
8617 // Even if we're not expected to free up the space, it's useful to know how
8618 // much is there while considering tail calls (because we can reuse it).
8619 FuncInfo->setBytesInStackArgArea(StackArgSize);
8620
8621 if (Subtarget->hasCustomCallingConv())
8622 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8623
8624 if (getTM().useNewSMEABILowering()) {
8625 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8626 SDValue Size;
8627 if (Attrs.hasZAState()) {
8628 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8629 DAG.getConstant(1, DL, MVT::i32));
8630 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8631 } else if (Attrs.hasAgnosticZAInterface()) {
8632 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
8633 SDValue Callee = DAG.getExternalSymbol(
8634 getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
8635 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
8636 TargetLowering::CallLoweringInfo CLI(DAG);
8637 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8638 getLibcallCallingConv(LC), RetTy, Callee, {});
8639 std::tie(Size, Chain) = LowerCallTo(CLI);
8640 }
8641 if (Size) {
8642 SDValue Buffer = DAG.getNode(
8643 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8644 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8645 Chain = Buffer.getValue(1);
8646
8647 Register BufferPtr =
8648 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8649 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8650 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
8651 DAG.getVTList(MVT::Other), Chain);
8652 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
8653 MFI.CreateVariableSizedObject(Align(16), nullptr);
8654 }
8655 }
8656 } else {
8657 // Old SME ABI lowering (deprecated):
8658 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8659 // will be expanded and stored in the static object later using a
8660 // pseudonode.
8661 if (Attrs.hasZAState()) {
8662 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8663 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8664 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8665 DAG.getConstant(1, DL, MVT::i32));
8666 SDValue Buffer;
8667 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8668 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8669 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8670 } else {
8671 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8672 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8673 DAG.getVTList(MVT::i64, MVT::Other),
8674 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8675 MFI.CreateVariableSizedObject(Align(16), nullptr);
8676 }
8677 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8678 DAG.getConstant(1, DL, MVT::i32));
8679 Chain = DAG.getNode(
8680 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8681 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8682 /*Num save slices*/ NumZaSaveSlices});
8683 } else if (Attrs.hasAgnosticZAInterface()) {
8684 // Call __arm_sme_state_size().
8685 SDValue BufferSize =
8686 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
8687 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8688 Chain = BufferSize.getValue(1);
8689 SDValue Buffer;
8690 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8691 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8692 DAG.getVTList(MVT::i64, MVT::Other),
8693 {Chain, BufferSize});
8694 } else {
8695 // Allocate space dynamically.
8696 Buffer = DAG.getNode(
8697 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8698 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8699 MFI.CreateVariableSizedObject(Align(16), nullptr);
8700 }
8701 // Copy the value to a virtual register, and save that in FuncInfo.
8702 Register BufferPtr =
8703 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8704 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8705 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
8706 }
8707 }
8708
8709 if (CallConv == CallingConv::PreserveNone) {
8710 for (const ISD::InputArg &I : Ins) {
8711 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8712 I.Flags.isSwiftAsync()) {
8713 MachineFunction &MF = DAG.getMachineFunction();
8714 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8715 MF.getFunction(),
8716 "Swift attributes can't be used with preserve_none",
8717 DL.getDebugLoc()));
8718 break;
8719 }
8720 }
8721 }
8722
8723 if (getTM().useNewSMEABILowering()) {
8724 // Clear new ZT0 state. TODO: Move this to the SME ABI pass.
8725 if (Attrs.isNewZT0())
8726 Chain = DAG.getNode(
8727 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8728 DAG.getTargetConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
8729 DAG.getTargetConstant(0, DL, MVT::i32));
8730 }
8731
8732 return Chain;
8733}
8734
8735void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8736 SelectionDAG &DAG,
8737 const SDLoc &DL,
8738 SDValue &Chain) const {
8739 MachineFunction &MF = DAG.getMachineFunction();
8740 MachineFrameInfo &MFI = MF.getFrameInfo();
8741 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8742 auto PtrVT = getPointerTy(DAG.getDataLayout());
8743 Function &F = MF.getFunction();
8744 bool IsWin64 =
8745 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8746
8747 SmallVector<SDValue, 8> MemOps;
8748
8749 auto GPRArgRegs = AArch64::getGPRArgRegs();
8750 unsigned NumGPRArgRegs = GPRArgRegs.size();
8751 if (Subtarget->isWindowsArm64EC()) {
8752 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8753 // functions.
8754 NumGPRArgRegs = 4;
8755 }
8756 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8757
8758 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8759 int GPRIdx = 0;
8760 if (GPRSaveSize != 0) {
8761 if (IsWin64) {
8762 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8763 if (GPRSaveSize & 15)
8764 // The extra size here, if triggered, will always be 8.
8765 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
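// Illustrative arithmetic (not from the source): with FirstVariadicGPR == 3
// and the usual 8 GPR argument registers, GPRSaveSize = 8 * (8 - 3) = 40,
// which is not 16-byte aligned, so an extra 8-byte fixed object is created
// and the Win64 register save area effectively spans 48 bytes below SP.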
8766 } else
8767 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8768
8769 SDValue FIN;
8770 if (Subtarget->isWindowsArm64EC()) {
8771 // With the Arm64EC ABI, we reserve the save area as usual, but we
8772 // compute its address relative to x4. For a normal AArch64->AArch64
8773 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8774 // different address.
8775 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8776 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8777 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8778 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8779 } else {
8780 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8781 }
8782
8783 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8784 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8785 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8786 SDValue Store =
8787 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8788 IsWin64 ? MachinePointerInfo::getFixedStack(
8789 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8790 : MachinePointerInfo::getStack(MF, i * 8));
8791 MemOps.push_back(Store);
8792 FIN =
8793 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8794 }
8795 }
8796 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8797 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8798
8799 if (Subtarget->hasFPARMv8() && !IsWin64) {
8800 auto FPRArgRegs = AArch64::getFPRArgRegs();
8801 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8802 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8803
8804 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8805 int FPRIdx = 0;
8806 if (FPRSaveSize != 0) {
8807 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8808
8809 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8810
8811 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8812 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8813 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8814
8815 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8816 MachinePointerInfo::getStack(MF, i * 16));
8817 MemOps.push_back(Store);
8818 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8819 DAG.getConstant(16, DL, PtrVT));
8820 }
8821 }
8822 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8823 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8824 }
8825
8826 if (!MemOps.empty()) {
8827 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8828 }
8829}
8830
8831/// LowerCallResult - Lower the result values of a call into the
8832/// appropriate copies out of appropriate physical registers.
8833SDValue AArch64TargetLowering::LowerCallResult(
8834 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8835 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8836 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8837 SDValue ThisVal, bool RequiresSMChange) const {
8838 DenseMap<unsigned, SDValue> CopiedRegs;
8839 // Copy all of the result registers out of their specified physreg.
8840 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8841 CCValAssign VA = RVLocs[i];
8842
8843 // Pass 'this' value directly from the argument to return value, to avoid
8844 // reg unit interference
8845 if (i == 0 && isThisReturn) {
8846 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8847 "unexpected return calling convention register assignment");
8848 InVals.push_back(ThisVal);
8849 continue;
8850 }
8851
8852 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8853 // allows one use of a physreg per block.
8854 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8855 if (!Val) {
8856 Val =
8857 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8858 Chain = Val.getValue(1);
8859 InGlue = Val.getValue(2);
8860 CopiedRegs[VA.getLocReg()] = Val;
8861 }
8862
8863 switch (VA.getLocInfo()) {
8864 default:
8865 llvm_unreachable("Unknown loc info!");
8866 case CCValAssign::Full:
8867 break;
8868 case CCValAssign::BCvt:
8869 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8870 break;
8871 case CCValAssign::AExtUpper:
8872 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8873 DAG.getConstant(32, DL, VA.getLocVT()));
8874 [[fallthrough]];
8875 case CCValAssign::AExt:
8876 [[fallthrough]];
8877 case CCValAssign::ZExt:
8878 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8879 break;
8880 }
8881
8882 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8883 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8884 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
8885
8886 InVals.push_back(Val);
8887 }
8888
8889 return Chain;
8890}
8891
8892/// Return true if the calling convention is one that we can guarantee TCO for.
8893static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8894 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8895 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8896}
8897
8898/// Return true if we might ever do TCO for calls with this calling convention.
8899 static bool mayTailCallThisCC(CallingConv::ID CC) {
8900 switch (CC) {
8901 case CallingConv::C:
8902 case CallingConv::AArch64_SVE_VectorCall:
8903 case CallingConv::PreserveMost:
8904 case CallingConv::PreserveAll:
8905 case CallingConv::PreserveNone:
8906 case CallingConv::Swift:
8907 case CallingConv::SwiftTail:
8908 case CallingConv::Tail:
8909 case CallingConv::Fast:
8910 return true;
8911 default:
8912 return false;
8913 }
8914}
8915
8916/// Return true if the call convention supports varargs
8917/// Currently only those that pass varargs like the C
8918/// calling convention does are eligible
8919/// Calling conventions listed in this function must also
8920/// be properly handled in AArch64Subtarget::isCallingConvWin64
8921 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8922 switch (CC) {
8923 case CallingConv::C:
8924 case CallingConv::Win64:
8925 // SVE vector call is only partially supported, but it should
8926 // support named arguments being passed. Any arguments being passed
8927 // as varargs, are still unsupported.
8928 case CallingConv::AArch64_SVE_VectorCall:
8929 return true;
8930 default:
8931 return false;
8932 }
8933}
8934
8935 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8936 const AArch64Subtarget *Subtarget,
8937 const TargetLowering::CallLoweringInfo &CLI,
8938 CCState &CCInfo) {
8939 const SelectionDAG &DAG = CLI.DAG;
8940 CallingConv::ID CalleeCC = CLI.CallConv;
8941 bool IsVarArg = CLI.IsVarArg;
8942 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8943 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8944
8945 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8946 // for the shadow store.
8947 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8948 CCInfo.AllocateStack(32, Align(16));
8949
8950 unsigned NumArgs = Outs.size();
8951 for (unsigned i = 0; i != NumArgs; ++i) {
8952 MVT ArgVT = Outs[i].VT;
8953 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8954
8955 bool UseVarArgCC = false;
8956 if (IsVarArg) {
8957 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8958 // too, so use the vararg CC to force them to integer registers.
8959 if (IsCalleeWin64) {
8960 UseVarArgCC = true;
8961 } else {
8962 UseVarArgCC = ArgFlags.isVarArg();
8963 }
8964 }
8965
8966 if (!UseVarArgCC) {
8967 // Get type of the original argument.
8968 EVT ActualVT =
8969 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8970 /*AllowUnknown*/ true);
8971 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8972 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8973 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8974 ArgVT = MVT::i8;
8975 else if (ActualMVT == MVT::i16)
8976 ArgVT = MVT::i16;
8977 }
8978
8979 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8980 // argument. This logic should exactly mirror LowerFormalArguments.
8981 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8982 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
8983 Outs[i].OrigTy, CCInfo);
8984 assert(!Res && "Call operand has unhandled type");
8985 (void)Res;
8986 }
8987}
8988
8989static SMECallAttrs
8990 getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI,
8991 const TargetLowering::CallLoweringInfo &CLI) {
8992 if (CLI.CB)
8993 return SMECallAttrs(*CLI.CB, &RTLCI);
8994 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8995 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
8996 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
8997}
8998
8999bool AArch64TargetLowering::isEligibleForTailCallOptimization(
9000 const CallLoweringInfo &CLI) const {
9001 CallingConv::ID CalleeCC = CLI.CallConv;
9002 if (!mayTailCallThisCC(CalleeCC))
9003 return false;
9004
9005 SDValue Callee = CLI.Callee;
9006 bool IsVarArg = CLI.IsVarArg;
9007 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9008 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9009 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9010 const SelectionDAG &DAG = CLI.DAG;
9011 MachineFunction &MF = DAG.getMachineFunction();
9012 const Function &CallerF = MF.getFunction();
9013 CallingConv::ID CallerCC = CallerF.getCallingConv();
9014
9015 // SME Streaming functions are not eligible for TCO as they may require
9016 // the streaming mode or ZA/ZT0 to be restored after returning from the call.
9017 SMECallAttrs CallAttrs =
9018 getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
9019 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
9020 CallAttrs.requiresPreservingAllZAState() ||
9021 CallAttrs.requiresPreservingZT0() ||
9022 CallAttrs.caller().hasStreamingBody())
9023 return false;
9024
9025 // Functions using the C or Fast calling convention that have an SVE signature
9026 // preserve more registers and should assume the SVE_VectorCall CC.
9027 // The check for matching callee-saved regs will determine whether it is
9028 // eligible for TCO.
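// For example (illustrative): a caller whose own signature takes or returns
// scalable vectors (e.g. <vscale x 4 x i32>) under the default C calling
// convention is flagged isSVECC() in LowerFormalArguments, so its preserved
// register set is compared here as if it used AArch64_SVE_VectorCall.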
9029 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
9030 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
9031 CallerCC = CallingConv::AArch64_SVE_VectorCall;
9032
9033 bool CCMatch = CallerCC == CalleeCC;
9034
9035 // When using the Windows calling convention on a non-Windows OS, we want
9036 // to back up and restore X18 in such functions; we can't do a tail call
9037 // from those functions.
9038 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
9039 CalleeCC != CallingConv::Win64)
9040 return false;
9041
9042 // Byval parameters hand the function a pointer directly into the stack area
9043 // we want to reuse during a tail call. Working around this *is* possible (see
9044 // X86) but less efficient and uglier in LowerCall.
9045 for (Function::const_arg_iterator i = CallerF.arg_begin(),
9046 e = CallerF.arg_end();
9047 i != e; ++i) {
9048 if (i->hasByValAttr())
9049 return false;
9050
9051 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
9052 // In this case, it is necessary to save X0/X1 in the callee and return it
9053 // in X0. Tail call opt may interfere with this, so we disable tail call
9054 // opt when the caller has an "inreg" attribute -- except if the callee
9055 // also has that attribute on the same argument, and the same value is
9056 // passed.
9057 if (i->hasInRegAttr()) {
9058 unsigned ArgIdx = i - CallerF.arg_begin();
9059 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
9060 return false;
9061 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
9062 if (!Attrs.hasAttribute(Attribute::InReg) ||
9063 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
9064 CLI.CB->getArgOperand(ArgIdx) != i) {
9065 return false;
9066 }
9067 }
9068 }
9069
9070 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
9071 return CCMatch;
9072
9073 // Externally-defined functions with weak linkage should not be
9074 // tail-called on AArch64 when the OS does not support dynamic
9075 // pre-emption of symbols, as the AAELF spec requires normal calls
9076 // to undefined weak functions to be replaced with a NOP or jump to the
9077 // next instruction. The behaviour of branch instructions in this
9078 // situation (as used for tail calls) is implementation-defined, so we
9079 // cannot rely on the linker replacing the tail call with a return.
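// For example (illustrative): given a declaration like
//   extern void maybe_missing() __attribute__((weak));
// a call to maybe_missing() in tail position is still emitted as an ordinary
// BL rather than a tail-call branch on ELF targets, for the reason above.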
9080 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9081 const GlobalValue *GV = G->getGlobal();
9082 const Triple &TT = getTargetMachine().getTargetTriple();
9083 if (GV->hasExternalWeakLinkage() &&
9084 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9085 return false;
9086 }
9087
9088 // Now we search for cases where we can use a tail call without changing the
9089 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9090 // concept.
9091
9092 // I want anyone implementing a new calling convention to think long and hard
9093 // about this assert.
9094 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9095 report_fatal_error("Unsupported variadic calling convention");
9096
9097 LLVMContext &C = *DAG.getContext();
9098 // Check that the call results are passed in the same way.
9099 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9100 CCAssignFnForCall(CalleeCC, IsVarArg),
9101 CCAssignFnForCall(CallerCC, IsVarArg)))
9102 return false;
9103 // The callee has to preserve all registers the caller needs to preserve.
9104 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9105 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9106 if (!CCMatch) {
9107 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9108 if (Subtarget->hasCustomCallingConv()) {
9109 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9110 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9111 }
9112 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9113 return false;
9114 }
9115
9116 // Nothing more to check if the callee is taking no arguments
9117 if (Outs.empty())
9118 return true;
9119
9120 SmallVector<CCValAssign, 16> ArgLocs;
9121 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9122
9123 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9124
9125 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9126 // When we are musttail, additional checks have been done and we can safely ignore this check.
9127 // At least two cases here: if caller is fastcc then we can't have any
9128 // memory arguments (we'd be expected to clean up the stack afterwards). If
9129 // caller is C then we could potentially use its argument area.
9130
9131 // FIXME: for now we take the most conservative of these in both cases:
9132 // disallow all variadic memory operands.
9133 for (const CCValAssign &ArgLoc : ArgLocs)
9134 if (!ArgLoc.isRegLoc())
9135 return false;
9136 }
9137
9138 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9139
9140 // If any of the arguments is passed indirectly, it must be SVE, so the
9141 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9142 // allocate space on the stack. That is why we determine explicitly here that
9143 // such a call cannot be a tail call.
9144 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9145 assert((A.getLocInfo() != CCValAssign::Indirect ||
9146 A.getValVT().isScalableVector() ||
9147 Subtarget->isWindowsArm64EC()) &&
9148 "Expected value to be scalable");
9149 return A.getLocInfo() == CCValAssign::Indirect;
9150 }))
9151 return false;
9152
9153 // If the stack arguments for this call do not fit into our own save area then
9154 // the call cannot be made tail.
9155 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9156 return false;
9157
9158 const MachineRegisterInfo &MRI = MF.getRegInfo();
9159 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9160 return false;
9161
9162 return true;
9163}
9164
9165SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9166 SelectionDAG &DAG,
9167 MachineFrameInfo &MFI,
9168 int ClobberedFI) const {
9169 SmallVector<SDValue, 8> ArgChains;
9170 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9171 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9172
9173 // Include the original chain at the beginning of the list. When this is
9174 // used by target LowerCall hooks, this helps legalize find the
9175 // CALLSEQ_BEGIN node.
9176 ArgChains.push_back(Chain);
9177
9178 // Add a chain value for each stack argument corresponding
9179 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9180 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9181 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9182 if (FI->getIndex() < 0) {
9183 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9184 int64_t InLastByte = InFirstByte;
9185 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9186
9187 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9188 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9189 ArgChains.push_back(SDValue(L, 1));
9190 }
9191
9192 // Build a tokenfactor for all the chains.
9193 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9194}
9195
9196bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9197 bool TailCallOpt) const {
9198 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9199 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9200}
9201
9202// Check if the value is zero-extended from i1 to i8
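// A minimal sketch of the check below: with RequiredZero = 0xFE, the argument
// is accepted only if bits [7:1] are known to be zero, i.e. the value is
// provably 0 or 1, in which case LowerCall can skip the explicit
// truncate-to-i1 + zero-extend-to-i8 sequence required by the AAPCS.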
9203static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9204 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9205 if (SizeInBits < 8)
9206 return false;
9207
9208 APInt RequiredZero(SizeInBits, 0xFE);
9209 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9210 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9211 return ZExtBool;
9212}
9213
9214void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9215 SDNode *Node) const {
9216 // Live-in physreg copies that are glued to SMSTART are applied as
9217 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9218 // register allocator to pass call args in callee saved regs, without extra
9219 // copies to avoid these fake clobbers of actually-preserved GPRs.
9220 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9221 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9222 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9223 if (MachineOperand &MO = MI.getOperand(I);
9224 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9225 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9226 AArch64::GPR64RegClass.contains(MO.getReg())))
9227 MI.removeOperand(I);
9228
9229 // The SVE vector length can change when entering/leaving streaming mode.
9230 // FPMR is set to 0 when entering/leaving streaming mode.
9231 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9232 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9233 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9234 /*IsImplicit=*/true));
9235 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9236 /*IsImplicit=*/true));
9237 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9238 /*IsImplicit=*/true));
9239 }
9240 }
9241
9242 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9243 // have nothing to do with VG, were it not that they are used to materialise a
9244 // frame-address. If they contain a frame-index to a scalable vector, this
9245 // will likely require an ADDVL instruction to materialise the address, thus
9246 // reading VG.
9247 const MachineFunction &MF = *MI.getMF();
9248 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9249 (MI.getOpcode() == AArch64::ADDXri ||
9250 MI.getOpcode() == AArch64::SUBXri)) {
9251 const MachineOperand &MO = MI.getOperand(1);
9252 if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
9253 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9254 /*IsImplicit=*/true));
9255 }
9256}
9257
9258 SDValue AArch64TargetLowering::changeStreamingMode(
9259 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9260 unsigned Condition, bool InsertVectorLengthCheck) const {
9261 MachineFunction &MF = DAG.getMachineFunction();
9262 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9263 FuncInfo->setHasStreamingModeChanges(true);
9264
9265 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9266 SmallVector<SDValue, 2> Ops = {Chain};
9267 if (InGlue)
9268 Ops.push_back(InGlue);
9269 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9270 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9271 };
9272
9273 if (InsertVectorLengthCheck && Enable) {
9274 // Non-streaming -> Streaming
9275 // Insert vector length check before smstart
9276 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9277 Chain = CheckVL.getValue(0);
9278 InGlue = CheckVL.getValue(1);
9279 }
9280
9281 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9282 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9283 SDValue MSROp =
9284 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9285 SmallVector<SDValue> Ops = {Chain, MSROp};
9286 unsigned Opcode;
9287 if (Condition != AArch64SME::Always) {
9288 Register PStateReg = FuncInfo->getPStateSMReg();
9289 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9290 SDValue PStateSM =
9291 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9292 // Use chain and glue from the CopyFromReg.
9293 Ops[0] = PStateSM.getValue(1);
9294 InGlue = PStateSM.getValue(2);
9295 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9296 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9297 Ops.push_back(ConditionOp);
9298 Ops.push_back(PStateSM);
9299 } else {
9300 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9301 }
9302 Ops.push_back(RegMask);
9303
9304 if (InGlue)
9305 Ops.push_back(InGlue);
9306
9307 SDValue SMChange =
9308 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9309
9310 if (!InsertVectorLengthCheck || Enable)
9311 return SMChange;
9312
9313 // Streaming -> Non-streaming
9314 // Insert vector length check after smstop since we cannot read VL
9315 // in streaming mode
9316 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9317}
9318
9321 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9322 CallAttrs.caller().hasStreamingBody())
9323 return AArch64SME::Always;
9324 if (CallAttrs.callee().hasNonStreamingInterface())
9325 return AArch64SME::IfCallerIsStreaming;
9326 if (CallAttrs.callee().hasStreamingInterface())
9327 return AArch64SME::IfCallerIsNonStreaming;
9328
9329 llvm_unreachable("Unsupported attributes");
9330}
9331
9332/// Check whether a stack argument requires lowering in a tail call.
9333 static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9334 const CCValAssign &VA, SDValue Arg,
9335 ISD::ArgFlagsTy Flags, int CallOffset) {
9336 // FIXME: We should be able to handle this case, but it's not clear how to.
9337 if (Flags.isZExt() || Flags.isSExt())
9338 return true;
9339
9340 for (;;) {
9341 // Look through nodes that don't alter the bits of the incoming value.
9342 unsigned Op = Arg.getOpcode();
9343 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9344 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9345 Arg = Arg.getOperand(0);
9346 continue;
9347 }
9348 break;
9349 }
9350
9351 // If the argument is a load from the same immutable stack slot, we can reuse
9352 // it.
9353 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9354 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9355 const MachineFrameInfo &MFI = MF.getFrameInfo();
9356 int FI = FINode->getIndex();
9357 if (!MFI.isImmutableObjectIndex(FI))
9358 return true;
9359 if (CallOffset != MFI.getObjectOffset(FI))
9360 return true;
9361 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9362 if (SizeInBits / 8 != static_cast<uint64_t>(MFI.getObjectSize(FI)))
9363 return true;
9364 return false;
9365 }
9366 }
9367
9368 return true;
9369}
9370
9371/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9372/// and add input and output parameter nodes.
9373SDValue
9374AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9375 SmallVectorImpl<SDValue> &InVals) const {
9376 SelectionDAG &DAG = CLI.DAG;
9377 SDLoc &DL = CLI.DL;
9378 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9379 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9380 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9381 SDValue Chain = CLI.Chain;
9382 SDValue Callee = CLI.Callee;
9383 bool &IsTailCall = CLI.IsTailCall;
9384 CallingConv::ID &CallConv = CLI.CallConv;
9385 bool IsVarArg = CLI.IsVarArg;
9386 const CallBase *CB = CLI.CB;
9387
9388 MachineFunction &MF = DAG.getMachineFunction();
9389 MachineFunction::CallSiteInfo CSInfo;
9390 bool IsThisReturn = false;
9391
9392 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9393 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9394 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9395 bool IsSibCall = false;
9396 bool GuardWithBTI = false;
9397
9398 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9399 !Subtarget->noBTIAtReturnTwice()) {
9400 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9401 }
9402
9403 // Analyze operands of the call, assigning locations to each operand.
9404 SmallVector<CCValAssign, 16> ArgLocs;
9405 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9406
9407 if (IsVarArg) {
9408 unsigned NumArgs = Outs.size();
9409
9410 for (unsigned i = 0; i != NumArgs; ++i) {
9411 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9412 report_fatal_error("Passing SVE types to variadic functions is "
9413 "currently not supported");
9414 }
9415 }
9416
9417 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9418
9419 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9420 // Assign locations to each value returned by this call.
9421 SmallVector<CCValAssign, 16> RVLocs;
9422 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9423 *DAG.getContext());
9424 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9425
9426 // Set type id for call site info.
9427 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9428 CSInfo = MachineFunction::CallSiteInfo(*CB);
9429
9430 // Check callee args/returns for SVE registers and set calling convention
9431 // accordingly.
9432 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9433 auto HasSVERegLoc = [](CCValAssign &Loc) {
9434 if (!Loc.isRegLoc())
9435 return false;
9436 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9437 AArch64::PPRRegClass.contains(Loc.getLocReg());
9438 };
9439 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9440 CallConv = CallingConv::AArch64_SVE_VectorCall;
9441 }
9442
9443 // Determine whether we need any streaming mode changes.
9444 SMECallAttrs CallAttrs =
9445 getSMECallAttrs(MF.getFunction(), getRuntimeLibcallsInfo(), CLI);
9446
9447 std::optional<unsigned> ZAMarkerNode;
9448 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9449
9450 if (UseNewSMEABILowering) {
9451 if (CallAttrs.requiresLazySave() ||
9452 CallAttrs.requiresPreservingAllZAState())
9453 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9454 else if (CallAttrs.caller().hasZAState() ||
9455 CallAttrs.caller().hasZT0State())
9456 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9457 }
9458
9459 if (IsTailCall) {
9460 // Check if it's really possible to do a tail call.
9461 IsTailCall = isEligibleForTailCallOptimization(CLI);
9462
9463 // A sibling call is one where we're under the usual C ABI and not planning
9464 // to change that but can still do a tail call:
9465 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9466 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9467 IsSibCall = true;
9468
9469 if (IsTailCall)
9470 ++NumTailCalls;
9471 }
9472
9473 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9474 report_fatal_error("failed to perform tail call elimination on a call "
9475 "site marked musttail");
9476
9477 // Get a count of how many bytes are to be pushed on the stack.
9478 unsigned NumBytes = CCInfo.getStackSize();
9479
9480 if (IsSibCall) {
9481 // Since we're not changing the ABI to make this a tail call, the memory
9482 // operands are already available in the caller's incoming argument space.
9483 NumBytes = 0;
9484 }
9485
9486 // FPDiff is the byte offset of the call's argument area from the callee's.
9487 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9488 // by this amount for a tail call. In a sibling call it must be 0 because the
9489 // caller will deallocate the entire stack and the callee still expects its
9490 // arguments to begin at SP+0. Completely unused for non-tail calls.
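// Illustrative arithmetic (not from the source): if the caller reserved 32
// bytes of incoming stack arguments (NumReusableBytes = 32) and this tail
// call needs 48 bytes after 16-byte alignment, then FPDiff = 32 - 48 = -16
// and TailCallReservedStack is raised to at least 16 below.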
9491 int FPDiff = 0;
9492
9493 if (IsTailCall && !IsSibCall) {
9494 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9495
9496 // Since callee will pop argument stack as a tail call, we must keep the
9497 // popped size 16-byte aligned.
9498 NumBytes = alignTo(NumBytes, 16);
9499
9500 // FPDiff will be negative if this tail call requires more space than we
9501 // would automatically have in our incoming argument space. Positive if we
9502 // can actually shrink the stack.
9503 FPDiff = NumReusableBytes - NumBytes;
9504
9505 // Update the required reserved area if this is the tail call requiring the
9506 // most argument stack space.
9507 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9508 FuncInfo->setTailCallReservedStack(-FPDiff);
9509
9510 // The stack pointer must be 16-byte aligned at all times it's used for a
9511 // memory operation, which in practice means at *all* times and in
9512 // particular across call boundaries. Therefore our own arguments started at
9513 // a 16-byte aligned SP and the delta applied for the tail call should
9514 // satisfy the same constraint.
9515 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9516 }
9517
9518 auto DescribeCallsite =
9519 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9520 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9521 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9522 R << ore::NV("Callee", ES->getSymbol());
9523 else if (CLI.CB && CLI.CB->getCalledFunction())
9524 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9525 else
9526 R << "unknown callee";
9527 R << "'";
9528 return R;
9529 };
9530
9531 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9532 bool RequiresSaveAllZA =
9533 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9534 if (RequiresLazySave) {
9535 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9536 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9537 TPIDR2.FrameIndex,
9538 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9539 Chain = DAG.getNode(
9540 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9541 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9542 TPIDR2ObjAddr);
9543 OptimizationRemarkEmitter ORE(&MF.getFunction());
9544 ORE.emit([&]() {
9545 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9546 CLI.CB)
9547 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9548 &MF.getFunction());
9549 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9550 });
9551 } else if (RequiresSaveAllZA) {
9552 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9553 "Cannot share state that may not exist");
9554 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9555 /*IsSave=*/true);
9556 }
9557
9558 bool RequiresSMChange = CallAttrs.requiresSMChange();
9559 if (RequiresSMChange) {
9560 OptimizationRemarkEmitter ORE(&MF.getFunction());
9561 ORE.emit([&]() {
9562 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9563 CLI.CB)
9564 : OptimizationRemarkAnalysis("sme", "SMETransition",
9565 &MF.getFunction());
9566 DescribeCallsite(R) << " requires a streaming mode transition";
9567 return R;
9568 });
9569 }
9570
9571 SDValue ZTFrameIdx;
9572 MachineFrameInfo &MFI = MF.getFrameInfo();
9573 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9574
9575 // If the caller has ZT0 state which will not be preserved by the callee,
9576 // spill ZT0 before the call.
9577 if (ShouldPreserveZT0) {
9578 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9579
9580 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9581 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9582 }
9583
9584 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
9585 // PSTATE.ZA before the call if there is no lazy-save active.
9586 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9587 assert((!DisableZA || !RequiresLazySave) &&
9588 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9589
9590 if (DisableZA)
9591 Chain = DAG.getNode(
9592 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9593 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9594
9595 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9596 // These operations are automatically eliminated by the prolog/epilog pass
9597 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9598 if (!IsSibCall) {
9599 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9600 if (ZAMarkerNode) {
9601 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to; simply
9602 // using a chain can result in incorrect scheduling. The markers refer to
9603 // the position just before the CALLSEQ_START (though occur after as
9604 // CALLSEQ_START lacks in-glue).
9605 Chain =
9606 DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other, MVT::Glue),
9607 {Chain, Chain.getValue(1)});
9608 }
9609 }
9610
9611 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9612 getPointerTy(DAG.getDataLayout()));
9613
9614 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
9615 SmallSet<unsigned, 8> RegsUsed;
9616 SmallVector<SDValue, 8> MemOpChains;
9617 auto PtrVT = getPointerTy(DAG.getDataLayout());
9618
9619 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9620 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9621 for (const auto &F : Forwards) {
9622 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9623 RegsToPass.emplace_back(F.PReg, Val);
9624 }
9625 }
9626
9627 // Walk the register/memloc assignments, inserting copies/loads.
9628 unsigned ExtraArgLocs = 0;
9629 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9630 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9631 SDValue Arg = OutVals[i];
9632 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9633
9634 // Promote the value if needed.
9635 switch (VA.getLocInfo()) {
9636 default:
9637 llvm_unreachable("Unknown loc info!");
9638 case CCValAssign::Full:
9639 break;
9640 case CCValAssign::SExt:
9641 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9642 break;
9643 case CCValAssign::ZExt:
9644 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9645 break;
9646 case CCValAssign::AExt:
9647 if (Outs[i].ArgVT == MVT::i1) {
9648 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9649 //
9650 // Check if we actually have to do this, because the value may
9651 // already be zero-extended.
9652 //
9653 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9654 // and rely on DAGCombiner to fold this, because the following
9655 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9656 //
9657 // (ext (zext x)) -> (zext x)
9658 //
9659 // This will give us (zext i32), which we cannot remove, so
9660 // try to check this beforehand.
9661 if (!checkZExtBool(Arg, DAG)) {
9662 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9663 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9664 }
9665 }
9666 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9667 break;
9668 case CCValAssign::AExtUpper:
9669 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9670 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9671 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9672 DAG.getConstant(32, DL, VA.getLocVT()));
9673 break;
9674 case CCValAssign::BCvt:
9675 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9676 break;
9677 case CCValAssign::Trunc:
9678 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9679 break;
9680 case CCValAssign::FPExt:
9681 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9682 break;
9683 case CCValAssign::Indirect: {
9684 bool isScalable = VA.getValVT().isScalableVT();
9685 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9686 "Indirect arguments should be scalable on most subtargets");
9687
9688 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9689 uint64_t PartSize = StoreSize;
9690 unsigned NumParts = 1;
9691 if (Outs[i].Flags.isInConsecutiveRegs()) {
9692 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9693 ++NumParts;
9694 StoreSize *= NumParts;
9695 }
9696
9697 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9698 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9699 MachineFrameInfo &MFI = MF.getFrameInfo();
9700 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9701 if (isScalable) {
9702 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
9703 VA.getValVT().getVectorElementType() == MVT::i1;
9704 MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector
9705 : TargetStackID::ScalableVector);
9706 }
9707
9708 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9709 SDValue Ptr = DAG.getFrameIndex(
9710 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9711 SDValue SpillSlot = Ptr;
9712
9713 // Ensure we generate all stores for each tuple part, whilst updating the
9714 // pointer after each store correctly using vscale.
9715 while (NumParts) {
9716 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9717 MemOpChains.push_back(Store);
9718
9719 NumParts--;
9720 if (NumParts > 0) {
9721 SDValue BytesIncrement;
9722 if (isScalable) {
9723 BytesIncrement = DAG.getVScale(
9724 DL, Ptr.getValueType(),
9725 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9726 } else {
9727 BytesIncrement = DAG.getConstant(
9728 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9729 Ptr.getValueType());
9730 }
9731 MPI = MachinePointerInfo(MPI.getAddrSpace());
9732 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9733 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9734 ExtraArgLocs++;
9735 i++;
9736 }
9737 }
9738
9739 Arg = SpillSlot;
9740 break;
9741 }
9742
9743 if (VA.isRegLoc()) {
9744 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9745 Outs[0].VT == MVT::i64) {
9746 assert(VA.getLocVT() == MVT::i64 &&
9747 "unexpected calling convention register assignment");
9748 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9749 "unexpected use of 'returned'");
9750 IsThisReturn = true;
9751 }
9752 if (RegsUsed.count(VA.getLocReg())) {
9753 // If this register has already been used then we're trying to pack
9754 // parts of an [N x i32] into an X-register. The extension type will
9755 // take care of putting the two halves in the right place but we have to
9756 // combine them.
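// Illustrative example (not from the original source): when a [2 x i32]
// argument is packed into x0, the low half arrives as-is and the high half
// was shifted left by 32 in the AExtUpper case above, so the OR below
// reassembles the full 64-bit register value.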
9757 SDValue &Bits =
9758 llvm::find_if(RegsToPass,
9759 [=](const std::pair<unsigned, SDValue> &Elt) {
9760 return Elt.first == VA.getLocReg();
9761 })
9762 ->second;
9763 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9764 // Call site info is used for a function's parameter entry value
9765 // tracking. For now we track only the simple cases in which a parameter
9766 // is transferred through a whole register.
9767 llvm::erase_if(CSInfo.ArgRegPairs,
9768 [&VA](MachineFunction::ArgRegPair ArgReg) {
9769 return ArgReg.Reg == VA.getLocReg();
9770 });
9771 } else {
9772 // Add an extra level of indirection for streaming mode changes by
9773 // using a pseudo copy node that cannot be rematerialised between a
9774 // smstart/smstop and the call by the simple register coalescer.
9775 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9776 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9777 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
9778 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9779 RegsUsed.insert(VA.getLocReg());
9780 const TargetOptions &Options = DAG.getTarget().Options;
9781 if (Options.EmitCallSiteInfo)
9782 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9783 }
9784 } else {
9785 assert(VA.isMemLoc());
9786
9787 SDValue DstAddr;
9788 MachinePointerInfo DstInfo;
9789
9790 // FIXME: This works on big-endian for composite byvals, which are the
9791 // common case. It should also work for fundamental types.
9792 uint32_t BEAlign = 0;
9793 unsigned OpSize;
9794 if (VA.getLocInfo() == CCValAssign::Indirect ||
9795 VA.getLocInfo() == CCValAssign::Trunc)
9796 OpSize = VA.getLocVT().getFixedSizeInBits();
9797 else
9798 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9799 : VA.getValVT().getSizeInBits();
9800 OpSize = (OpSize + 7) / 8;
9801 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9802 !Flags.isInConsecutiveRegs()) {
9803 if (OpSize < 8)
9804 BEAlign = 8 - OpSize;
9805 }
9806 unsigned LocMemOffset = VA.getLocMemOffset();
9807 int32_t Offset = LocMemOffset + BEAlign;
9808
9809 if (IsTailCall) {
9810 // When the frame pointer is perfectly aligned for the tail call and the
9811 // same stack argument is passed down intact, we can reuse it.
9812 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
9813 continue;
9814
9815 Offset = Offset + FPDiff;
9816 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9817
9818 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9819 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9820
9821 // Make sure any stack arguments overlapping with where we're storing
9822 // are loaded before this eventual operation. Otherwise they'll be
9823 // clobbered.
9824 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9825 } else {
9826 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9827
9828 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9829 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9830 }
9831
9832 if (Outs[i].Flags.isByVal()) {
9833 SDValue SizeNode =
9834 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9835 SDValue Cpy = DAG.getMemcpy(
9836 Chain, DL, DstAddr, Arg, SizeNode,
9837 Outs[i].Flags.getNonZeroByValAlign(),
9838 /*isVol = */ false, /*AlwaysInline = */ false,
9839 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9840
9841 MemOpChains.push_back(Cpy);
9842 } else {
9843 // Since we pass i1/i8/i16 as i1/i8/i16 on the stack and Arg is already
9844 // promoted to the legal register type i32, we should truncate Arg back to
9845 // i1/i8/i16.
9846 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9847 VA.getValVT() == MVT::i16)
9848 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9849
9850 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9851 MemOpChains.push_back(Store);
9852 }
9853 }
9854 }
9855
9856 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9857 !(CLI.CB && CLI.CB->isMustTailCall())) {
9858 SDValue ParamPtr = StackPtr;
9859 if (IsTailCall) {
9860 // Create a dummy object at the top of the stack that can be used to get
9861 // the SP after the epilogue
9862 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9863 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9864 }
9865
9866 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9867 // describing the argument list. x4 contains the address of the
9868 // first stack parameter. x5 contains the size in bytes of all parameters
9869 // passed on the stack.
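// Illustrative example (not from the original source): for a varargs
// Arm64EC call that passes 32 bytes of arguments on the stack, x4 points at
// the first stack argument (or the dummy frame object for tail calls) and
// x5 holds the constant 32, i.e. the NumBytes value used below.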
9870 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9871 RegsToPass.emplace_back(AArch64::X5,
9872 DAG.getConstant(NumBytes, DL, MVT::i64));
9873 }
9874
9875 if (!MemOpChains.empty())
9876 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9877
9878 SDValue InGlue;
9879 if (RequiresSMChange) {
9880 bool InsertVectorLengthCheck =
9882 Chain = changeStreamingMode(
9883 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
9884 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
9885 InGlue = Chain.getValue(1);
9886 }
9887
9888 // Build a sequence of copy-to-reg nodes chained together with token chain
9889 // and flag operands which copy the outgoing args into the appropriate regs.
9890 for (auto &RegToPass : RegsToPass) {
9891 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9892 RegToPass.second, InGlue);
9893 InGlue = Chain.getValue(1);
9894 }
9895
9896 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9897 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9898 // node so that legalize doesn't hack it.
9899 const GlobalValue *CalledGlobal = nullptr;
9900 unsigned OpFlags = 0;
9901 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9902 CalledGlobal = G->getGlobal();
9903 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9904 getTargetMachine());
9905 if (OpFlags & AArch64II::MO_GOT) {
9906 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9907 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9908 } else {
9909 const GlobalValue *GV = G->getGlobal();
9910 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9911 }
9912 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9913 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9914 Subtarget->isTargetMachO()) ||
9915 MF.getFunction().getParent()->getRtLibUseGOT();
9916 const char *Sym = S->getSymbol();
9917 if (UseGot) {
9918 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9919 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9920 } else {
9921 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9922 }
9923 }
9924
9925 // We don't usually want to end the call-sequence here because we would tidy
9926 // the frame up *after* the call; however, in the ABI-changing tail-call case
9927 // we've carefully laid out the parameters so that when sp is reset they'll be
9928 // in the correct location.
9929 if (IsTailCall && !IsSibCall) {
9930 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9931 InGlue = Chain.getValue(1);
9932 }
9933
9934 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9935
9936 std::vector<SDValue> Ops;
9937 Ops.push_back(Chain);
9938 Ops.push_back(Callee);
9939
9940 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9941 // be expanded to the call, directly followed by a special marker sequence and
9942 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
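// Illustrative expansion (not from the original source): a call lowered
// with CALL_RVMARKER typically ends up as something like
//   bl _fn
//   mov x29, x29 ; marker the ObjC runtime recognises
//   bl _objc_retainAutoreleasedReturnValue
// where the marker and the retain/claim call are controlled by the extra
// operands added below.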
9943 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9944 assert(!IsTailCall &&
9945 "tail calls cannot be marked with clang.arc.attachedcall");
9946 Opc = AArch64ISD::CALL_RVMARKER;
9947
9948 // Add a target global address for the retainRV/claimRV runtime function
9949 // just before the call target.
9950 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9951 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9952 Ops.insert(Ops.begin() + 1, GA);
9953
9954 // We may or may not need to emit both the marker and the retain/claim call.
9955 // Tell the pseudo expansion using an additional boolean op.
9956 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
9957 SDValue DoEmitMarker =
9958 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
9959 Ops.insert(Ops.begin() + 2, DoEmitMarker);
9960 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9961 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9962 } else if (GuardWithBTI) {
9963 Opc = AArch64ISD::CALL_BTI;
9964 }
9965
9966 if (IsTailCall) {
9967 // Each tail call may have to adjust the stack by a different amount, so
9968 // this information must travel along with the operation for eventual
9969 // consumption by emitEpilogue.
9970 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9971 }
9972
9973 if (CLI.PAI) {
9974 const uint64_t Key = CLI.PAI->Key;
9976 "Invalid auth call key");
9977
9978 // Split the discriminator into address/integer components.
9979 SDValue AddrDisc, IntDisc;
9980 std::tie(IntDisc, AddrDisc) =
9981 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9982
9983 if (Opc == AArch64ISD::CALL_RVMARKER)
9984 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9985 else
9986 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9987 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9988 Ops.push_back(IntDisc);
9989 Ops.push_back(AddrDisc);
9990 }
9991
9992 // Add argument registers to the end of the list so that they are known live
9993 // into the call.
9994 for (auto &RegToPass : RegsToPass)
9995 Ops.push_back(DAG.getRegister(RegToPass.first,
9996 RegToPass.second.getValueType()));
9997
9998 // Add a register mask operand representing the call-preserved registers.
9999 const uint32_t *Mask;
10000 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10001 if (IsThisReturn) {
10002 // For 'this' returns, use the X0-preserving mask if applicable
10003 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10004 if (!Mask) {
10005 IsThisReturn = false;
10006 Mask = TRI->getCallPreservedMask(MF, CallConv);
10007 }
10008 } else
10009 Mask = TRI->getCallPreservedMask(MF, CallConv);
10010
10011 if (Subtarget->hasCustomCallingConv())
10012 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10013
10014 if (TRI->isAnyArgRegReserved(MF))
10015 TRI->emitReservedArgRegCallError(MF);
10016
10017 assert(Mask && "Missing call preserved mask for calling convention");
10018 Ops.push_back(DAG.getRegisterMask(Mask));
10019
10020 if (InGlue.getNode())
10021 Ops.push_back(InGlue);
10022
10023 // If we're doing a tail call, use a TC_RETURN here rather than an
10024 // actual call instruction.
10025 if (IsTailCall) {
10026 MF.getFrameInfo().setHasTailCall();
10027 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10028 if (IsCFICall)
10029 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10030
10031 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10032 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10033 if (CalledGlobal &&
10034 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10035 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10036 return Ret;
10037 }
10038
10039 // Returns a chain and a flag for retval copy to use.
10040 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10041 if (IsCFICall)
10042 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10043
10044 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10045 InGlue = Chain.getValue(1);
10046 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10047 if (CalledGlobal &&
10048 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10049 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10050
10051 uint64_t CalleePopBytes =
10052 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10053
10054 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10055 InGlue = Chain.getValue(1);
10056
10057 // Handle result values, copying them out of physregs into vregs that we
10058 // return.
10059 SDValue Result = LowerCallResult(
10060 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10061 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10062
10063 if (!Ins.empty())
10064 InGlue = Result.getValue(Result->getNumValues() - 1);
10065
10066 if (RequiresSMChange) {
10067 Result = changeStreamingMode(
10068 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10069 getSMToggleCondition(CallAttrs));
10070 }
10071
10072 if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
10073 // Unconditionally resume ZA.
10074 Result = DAG.getNode(
10075 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10076 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10077
10078 if (ShouldPreserveZT0)
10079 Result =
10080 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10081 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10082
10083 if (RequiresLazySave) {
10084 Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
10085 } else if (RequiresSaveAllZA) {
10086 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10087 /*IsSave=*/false);
10088 }
10089
10090 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10091 RequiresSaveAllZA) {
10092 for (unsigned I = 0; I < InVals.size(); ++I) {
10093 // The smstart/smstop is chained as part of the call, but when the
10094 // resulting chain is discarded (which happens when the call is not part
10095 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10096 // smstart/smstop is chained to the result value. We can do that by doing
10097 // a vreg -> vreg copy.
10098 Register Reg = MF.getRegInfo().createVirtualRegister(
10099 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
10100 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10101 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10102 InVals[I].getValueType());
10103 }
10104 }
10105
10106 if (CallConv == CallingConv::PreserveNone) {
10107 for (const ISD::OutputArg &O : Outs) {
10108 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10109 O.Flags.isSwiftAsync()) {
10110 MachineFunction &MF = DAG.getMachineFunction();
10111 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10112 MF.getFunction(),
10113 "Swift attributes can't be used with preserve_none",
10114 DL.getDebugLoc()));
10115 break;
10116 }
10117 }
10118 }
10119
10120 return Result;
10121}
10122
10123bool AArch64TargetLowering::CanLowerReturn(
10124 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10125 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
10126 const Type *RetTy) const {
10127 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10128 SmallVector<CCValAssign, 16> RVLocs;
10129 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10130 return CCInfo.CheckReturn(Outs, RetCC);
10131}
10132
10133SDValue
10134AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10135 bool isVarArg,
10136 const SmallVectorImpl<ISD::OutputArg> &Outs,
10137 const SmallVectorImpl<SDValue> &OutVals,
10138 const SDLoc &DL, SelectionDAG &DAG) const {
10139 auto &MF = DAG.getMachineFunction();
10140 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10141
10142 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10143 SmallVector<CCValAssign, 16> RVLocs;
10144 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10145 CCInfo.AnalyzeReturn(Outs, RetCC);
10146
10147 // Copy the result values into the output registers.
10148 SDValue Glue;
10149 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
10150 SmallSet<unsigned, 4> RegsUsed;
10151 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10152 ++i, ++realRVLocIdx) {
10153 CCValAssign &VA = RVLocs[i];
10154 assert(VA.isRegLoc() && "Can only return in registers!");
10155 SDValue Arg = OutVals[realRVLocIdx];
10156
10157 switch (VA.getLocInfo()) {
10158 default:
10159 llvm_unreachable("Unknown loc info!");
10160 case CCValAssign::Full:
10161 if (Outs[i].ArgVT == MVT::i1) {
10162 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10163 // value. This is strictly redundant on Darwin (which uses "zeroext
10164 // i1"), but will be optimised out before ISel.
10165 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10166 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10167 }
10168 break;
10169 case CCValAssign::BCvt:
10170 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10171 break;
10172 case CCValAssign::AExt:
10173 case CCValAssign::ZExt:
10174 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10175 break;
10176 case CCValAssign::AExtUpper:
10177 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10178 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10179 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10180 DAG.getConstant(32, DL, VA.getLocVT()));
10181 break;
10182 }
10183
10184 if (RegsUsed.count(VA.getLocReg())) {
10185 SDValue &Bits =
10186 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10187 return Elt.first == VA.getLocReg();
10188 })->second;
10189 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10190 } else {
10191 RetVals.emplace_back(VA.getLocReg(), Arg);
10192 RegsUsed.insert(VA.getLocReg());
10193 }
10194 }
10195
10196 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10197
10198 // Emit SMSTOP before returning from a locally streaming function
10199 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10200 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10201 if (FuncAttrs.hasStreamingCompatibleInterface())
10202 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10203 /*Glue*/ SDValue(),
10205 else
10206 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10207 /*Glue*/ SDValue(), AArch64SME::Always);
10208 Glue = Chain.getValue(1);
10209 }
10210
10211 SmallVector<SDValue, 4> RetOps(1, Chain);
10212 for (auto &RetVal : RetVals) {
10213 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10214 isPassedInFPR(RetVal.second.getValueType()))
10215 RetVal.second =
10216 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10217 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10218 RetVal.second);
10219 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10220 Glue = Chain.getValue(1);
10221 RetOps.push_back(
10222 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10223 }
10224
10225 // The Windows AArch64 ABIs require that, when returning a struct by value,
10226 // we copy the sret argument into X0 for the return.
10227 // We saved the argument into a virtual register in the entry block,
10228 // so now we copy the value out and into X0.
10229 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10230 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10231 getPointerTy(MF.getDataLayout()));
10232
10233 unsigned RetValReg = AArch64::X0;
10234 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10235 RetValReg = AArch64::X8;
10236 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10237 Glue = Chain.getValue(1);
10238
10239 RetOps.push_back(
10240 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10241 }
10242
10243 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10244 if (I) {
10245 for (; *I; ++I) {
10246 if (AArch64::GPR64RegClass.contains(*I))
10247 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10248 else if (AArch64::FPR64RegClass.contains(*I))
10249 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10250 else
10251 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10252 }
10253 }
10254
10255 RetOps[0] = Chain; // Update chain.
10256
10257 // Add the glue if we have it.
10258 if (Glue.getNode())
10259 RetOps.push_back(Glue);
10260
10261 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10262 // ARM64EC entry thunks use a special return sequence: instead of a regular
10263 // "ret" instruction, they need to explicitly call the emulator.
10264 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10265 SDValue Arm64ECRetDest =
10266 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10267 Arm64ECRetDest =
10268 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10269 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10270 MachinePointerInfo());
10271 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10272 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10273 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10274 }
10275
10276 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10277}
10278
10279//===----------------------------------------------------------------------===//
10280// Other Lowering Code
10281//===----------------------------------------------------------------------===//
10282
10283SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10284 SelectionDAG &DAG,
10285 unsigned Flag) const {
10286 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10287 N->getOffset(), Flag);
10288}
10289
10290SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10291 SelectionDAG &DAG,
10292 unsigned Flag) const {
10293 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10294}
10295
10296SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10297 SelectionDAG &DAG,
10298 unsigned Flag) const {
10299 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10300 N->getOffset(), Flag);
10301}
10302
10303 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
10304 SelectionDAG &DAG,
10305 unsigned Flag) const {
10306 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10307}
10308
10309SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10310 SelectionDAG &DAG,
10311 unsigned Flag) const {
10312 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10313}
10314
10315// (loadGOT sym)
10316template <class NodeTy>
10317SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10318 unsigned Flags) const {
10319 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10320 SDLoc DL(N);
10321 EVT Ty = getPointerTy(DAG.getDataLayout());
10322 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10323 // FIXME: Once remat is capable of dealing with instructions with register
10324 // operands, expand this into two nodes instead of using a wrapper node.
10325 if (DAG.getMachineFunction()
10326 .getInfo<AArch64FunctionInfo>()
10327 ->hasELFSignedGOT())
10328 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10329 0);
10330 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10331}
10332
10333// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10334template <class NodeTy>
10335SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10336 unsigned Flags) const {
10337 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10338 SDLoc DL(N);
10339 EVT Ty = getPointerTy(DAG.getDataLayout());
10340 const unsigned char MO_NC = AArch64II::MO_NC;
10341 return DAG.getNode(
10342 AArch64ISD::WrapperLarge, DL, Ty,
10343 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10344 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10345 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10346 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10347}
10348
10349// (addlow (adrp %hi(sym)) %lo(sym))
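// Illustrative output (not from the original source): under the usual small
// code model this materialises roughly as
//   adrp xN, sym
//   add xN, xN, :lo12:sym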
10350template <class NodeTy>
10351SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10352 unsigned Flags) const {
10353 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10354 SDLoc DL(N);
10355 EVT Ty = getPointerTy(DAG.getDataLayout());
10356 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10357 SDValue Lo = getTargetNode(N, Ty, DAG,
10358 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10359 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10360 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10361}
10362
10363// (adr sym)
10364template <class NodeTy>
10365SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10366 unsigned Flags) const {
10367 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10368 SDLoc DL(N);
10369 EVT Ty = getPointerTy(DAG.getDataLayout());
10370 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10371 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10372}
10373
10374SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10375 SelectionDAG &DAG) const {
10376 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10377 const GlobalValue *GV = GN->getGlobal();
10378 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10379
10380 if (OpFlags != AArch64II::MO_NO_FLAG)
10382 "unexpected offset in global node");
10383
10384 // This also catches the large code model case for Darwin, and tiny code
10385 // model with got relocations.
10386 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10387 return getGOT(GN, DAG, OpFlags);
10388 }
10389
10390 SDValue Result;
10391 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10392 !getTargetMachine().isPositionIndependent()) {
10393 Result = getAddrLarge(GN, DAG, OpFlags);
10394 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10395 Result = getAddrTiny(GN, DAG, OpFlags);
10396 } else {
10397 Result = getAddr(GN, DAG, OpFlags);
10398 }
10399 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10400 SDLoc DL(GN);
10401 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10402 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10403 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10404 return Result;
10405}
10406
10407/// Convert a TLS address reference into the correct sequence of loads
10408/// and calls to compute the variable's address (for Darwin, currently) and
10409/// return an SDValue containing the final node.
10410
10411/// Darwin only has one TLS scheme which must be capable of dealing with the
10412/// fully general situation, in the worst case. This means:
10413/// + "extern __thread" declaration.
10414/// + Defined in a possibly unknown dynamic library.
10415///
10416/// The general system is that each __thread variable has a [3 x i64] descriptor
10417/// which contains information used by the runtime to calculate the address. The
10418/// only part of this the compiler needs to know about is the first xword, which
10419/// contains a function pointer that must be called with the address of the
10420/// entire descriptor in "x0".
10421///
10422/// Since this descriptor may be in a different unit, in general even the
10423/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10424/// is:
10425/// adrp x0, _var@TLVPPAGE
10426/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10427/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10428/// ; the function pointer
10429/// blr x1 ; Uses descriptor address in x0
10430/// ; Address of _var is now in x0.
10431///
10432/// If the address of _var's descriptor *is* known to the linker, then it can
10433/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10434/// a slight efficiency gain.
10435SDValue
10436AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10437 SelectionDAG &DAG) const {
10438 assert(Subtarget->isTargetDarwin() &&
10439 "This function expects a Darwin target");
10440
10441 SDLoc DL(Op);
10442 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10443 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10444 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10445
10446 SDValue TLVPAddr =
10447 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10448 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10449
10450 // The first entry in the descriptor is a function pointer that we must call
10451 // to obtain the address of the variable.
10452 SDValue Chain = DAG.getEntryNode();
10453 SDValue FuncTLVGet = DAG.getLoad(
10454 PtrMemVT, DL, Chain, DescAddr,
10455 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10456 Align(PtrMemVT.getSizeInBits() / 8),
10457 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10458 Chain = FuncTLVGet.getValue(1);
10459
10460 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10461 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10462
10463 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10464 MFI.setAdjustsStack(true);
10465
10466 // TLS calls preserve all registers except those that absolutely must be
10467 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10468 // silly).
10469 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10470 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10471 if (Subtarget->hasCustomCallingConv())
10472 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10473
10474 // Finally, we can make the call. This is just a degenerate version of a
10475 // normal AArch64 call node: x0 takes the address of the descriptor, and
10476 // returns the address of the variable in this thread.
10477 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10478
10479 unsigned Opcode = AArch64ISD::CALL;
10480 SmallVector<SDValue, 8> Ops;
10481 Ops.push_back(Chain);
10482 Ops.push_back(FuncTLVGet);
10483
10484 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10485 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10486 Opcode = AArch64ISD::AUTH_CALL;
10487 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10488 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10489 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10490 }
10491
10492 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10493 Ops.push_back(DAG.getRegisterMask(Mask));
10494 Ops.push_back(Chain.getValue(1));
10495 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10496 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10497}
10498
10499/// Convert a thread-local variable reference into a sequence of instructions to
10500/// compute the variable's address for the local exec TLS model of ELF targets.
10501/// The sequence depends on the maximum TLS area size.
10502SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10503 SDValue ThreadBase,
10504 const SDLoc &DL,
10505 SelectionDAG &DAG) const {
10506 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10507 SDValue TPOff, Addr;
10508
10509 switch (DAG.getTarget().Options.TLSSize) {
10510 default:
10511 llvm_unreachable("Unexpected TLS size");
10512
10513 case 12: {
10514 // mrs x0, TPIDR_EL0
10515 // add x0, x0, :tprel_lo12:a
10516 SDValue Var = DAG.getTargetGlobalAddress(
10517 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10518 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10519 Var,
10520 DAG.getTargetConstant(0, DL, MVT::i32)),
10521 0);
10522 }
10523
10524 case 24: {
10525 // mrs x0, TPIDR_EL0
10526 // add x0, x0, :tprel_hi12:a
10527 // add x0, x0, :tprel_lo12_nc:a
10528 SDValue HiVar = DAG.getTargetGlobalAddress(
10529 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10530 SDValue LoVar = DAG.getTargetGlobalAddress(
10531 GV, DL, PtrVT, 0,
10532 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10533 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10534 HiVar,
10535 DAG.getTargetConstant(0, DL, MVT::i32)),
10536 0);
10537 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10538 LoVar,
10539 DAG.getTargetConstant(0, DL, MVT::i32)),
10540 0);
10541 }
10542
10543 case 32: {
10544 // mrs x1, TPIDR_EL0
10545 // movz x0, #:tprel_g1:a
10546 // movk x0, #:tprel_g0_nc:a
10547 // add x0, x1, x0
10548 SDValue HiVar = DAG.getTargetGlobalAddress(
10549 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10550 SDValue LoVar = DAG.getTargetGlobalAddress(
10551 GV, DL, PtrVT, 0,
10552 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10553 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10554 DAG.getTargetConstant(16, DL, MVT::i32)),
10555 0);
10556 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10557 DAG.getTargetConstant(0, DL, MVT::i32)),
10558 0);
10559 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10560 }
10561
10562 case 48: {
10563 // mrs x1, TPIDR_EL0
10564 // movz x0, #:tprel_g2:a
10565 // movk x0, #:tprel_g1_nc:a
10566 // movk x0, #:tprel_g0_nc:a
10567 // add x0, x1, x0
10568 SDValue HiVar = DAG.getTargetGlobalAddress(
10569 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10570 SDValue MiVar = DAG.getTargetGlobalAddress(
10571 GV, DL, PtrVT, 0,
10572 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10573 SDValue LoVar = DAG.getTargetGlobalAddress(
10574 GV, DL, PtrVT, 0,
10575 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10576 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10577 DAG.getTargetConstant(32, DL, MVT::i32)),
10578 0);
10579 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10580 DAG.getTargetConstant(16, DL, MVT::i32)),
10581 0);
10582 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10583 DAG.getTargetConstant(0, DL, MVT::i32)),
10584 0);
10585 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10586 }
10587 }
10588}
10589
10590/// When accessing thread-local variables under either the general-dynamic or
10591/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10592/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10593/// is a function pointer to carry out the resolution.
10594///
10595/// The sequence is:
10596/// adrp x0, :tlsdesc:var
10597/// ldr x1, [x0, #:tlsdesc_lo12:var]
10598/// add x0, x0, #:tlsdesc_lo12:var
10599/// .tlsdesccall var
10600/// blr x1
10601/// (TPIDR_EL0 offset now in x0)
10602///
10603 /// The above sequence must be produced unscheduled so that the linker can
10604 /// optimize/relax it.
10605 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10606 /// whole sequence and is expanded very late in the compilation flow, to ensure
10607 /// it is emitted exactly as written above.
10608SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10609 const SDLoc &DL,
10610 SelectionDAG &DAG) const {
10611 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10612 auto &MF = DAG.getMachineFunction();
10613 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10614
10615 SDValue Glue;
10616 SDValue Chain = DAG.getEntryNode();
10617 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10618
10619 SMECallAttrs TLSCallAttrs(FuncInfo->getSMEFnAttrs(), {}, SMEAttrs::Normal);
10620 bool RequiresSMChange = TLSCallAttrs.requiresSMChange();
10621
10622 auto ChainAndGlue = [](SDValue Chain) -> std::pair<SDValue, SDValue> {
10623 return {Chain, Chain.getValue(1)};
10624 };
10625
10626 if (RequiresSMChange)
10627 std::tie(Chain, Glue) =
10628 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/false, Chain, Glue,
10629 getSMToggleCondition(TLSCallAttrs)));
10630
10631 unsigned Opcode =
10632 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10633 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10634 : AArch64ISD::TLSDESC_CALLSEQ;
10635 SDValue Ops[] = {Chain, SymAddr, Glue};
10636 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
10637 Opcode, DL, NodeTys, Glue ? ArrayRef(Ops) : ArrayRef(Ops).drop_back()));
10638
10639 if (TLSCallAttrs.requiresLazySave())
10640 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
10641 AArch64ISD::REQUIRES_ZA_SAVE, DL, NodeTys, {Chain, Chain.getValue(1)}));
10642
10643 if (RequiresSMChange)
10644 std::tie(Chain, Glue) =
10645 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
10646 getSMToggleCondition(TLSCallAttrs)));
10647
10648 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10649}
10650
10651SDValue
10652AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10653 SelectionDAG &DAG) const {
10654 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10655
10656 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10657 AArch64FunctionInfo *MFI =
10658 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10659
10663
10665 if (Model == TLSModel::LocalDynamic)
10667 }
10668
10670 Model != TLSModel::LocalExec)
10671 report_fatal_error("ELF TLS only supported in small memory model or "
10672 "in local exec TLS model");
10673 // Different choices can be made for the maximum size of the TLS area for a
10674 // module. For the small address model, the default TLS size is 16MiB and the
10675 // maximum TLS size is 4GiB.
10676 // FIXME: add tiny and large code model support for TLS access models other
10677 // than local exec. We currently generate the same code as small for tiny,
10678 // which may be larger than needed.
10679
10680 SDValue TPOff;
10681 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10682 SDLoc DL(Op);
10683 const GlobalValue *GV = GA->getGlobal();
10684
10685 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10686
10687 if (Model == TLSModel::LocalExec) {
10688 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10689 } else if (Model == TLSModel::InitialExec) {
10690 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10691 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10692 } else if (Model == TLSModel::LocalDynamic) {
10693 // Local-dynamic accesses proceed in two phases: a general-dynamic TLS
10694 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10695 // the beginning of the module's TLS region, followed by a DTPREL offset
10696 // calculation.
10697
10698 // These accesses will need deduplicating if there's more than one.
10699 MFI->incNumLocalDynamicTLSAccesses();
10700
10701 // The call needs a relocation too for linker relaxation. It doesn't make
10702 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10703 // the address.
10704 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10706
10707 // Now we can calculate the offset from TPIDR_EL0 to this module's
10708 // thread-local area.
10709 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10710
10711 // Now use :dtprel_whatever: operations to calculate this variable's offset
10712 // in its thread-storage area.
10713 SDValue HiVar = DAG.getTargetGlobalAddress(
10714 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10715 SDValue LoVar = DAG.getTargetGlobalAddress(
10716 GV, DL, MVT::i64, 0,
10717 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10718
10719 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10720 DAG.getTargetConstant(0, DL, MVT::i32)),
10721 0);
10722 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10723 DAG.getTargetConstant(0, DL, MVT::i32)),
10724 0);
10725 } else if (Model == TLSModel::GeneralDynamic) {
10726 // The call needs a relocation too for linker relaxation. It doesn't make
10727 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10728 // the address.
10729 SDValue SymAddr =
10730 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10731
10732 // Finally we can make a call to calculate the offset from tpidr_el0.
10733 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10734 } else
10735 llvm_unreachable("Unsupported ELF TLS access model");
10736
10737 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10738}
10739
10740SDValue
10741AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10742 SelectionDAG &DAG) const {
10743 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10744
10745 SDValue Chain = DAG.getEntryNode();
10746 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10747 SDLoc DL(Op);
10748
10749 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10750
10751 // Load the ThreadLocalStoragePointer from the TEB
10752 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10753 SDValue TLSArray =
10754 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10755 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10756 Chain = TLSArray.getValue(1);
10757
10758 // Load the TLS index from the C runtime;
10759 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10760 // This also does the same as LOADgot, but using a generic i32 load,
10761 // while LOADgot only loads i64.
10762 SDValue TLSIndexHi =
10763 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10764 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10765 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10766 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10767 SDValue TLSIndex =
10768 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10769 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10770 Chain = TLSIndex.getValue(1);
10771
10772 // The pointer to the thread's TLS data area is stored in the TLS array at
10773 // the offset TLSIndex * 8.
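// Illustrative summary (not from the original source), in pseudo-C:
//   TLSArray = *(TEB + 0x58);                  // TEB is in x18 on Windows
//   TLSData  = *(TLSArray + _tls_index * 8);
//   Addr     = TLSData + secrel offset of the variable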
10774 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10775 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10776 DAG.getConstant(3, DL, PtrVT));
10777 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10778 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10779 MachinePointerInfo());
10780 Chain = TLS.getValue(1);
10781
10782 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10783 const GlobalValue *GV = GA->getGlobal();
10784 SDValue TGAHi = DAG.getTargetGlobalAddress(
10785 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10786 SDValue TGALo = DAG.getTargetGlobalAddress(
10787 GV, DL, PtrVT, 0,
10788 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10789
10790 // Add the offset from the start of the .tls section (section base).
10791 SDValue Addr =
10792 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10793 DAG.getTargetConstant(0, DL, MVT::i32)),
10794 0);
10795 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10796 return Addr;
10797}
10798
10799SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10800 SelectionDAG &DAG) const {
10801 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10802 if (DAG.getTarget().useEmulatedTLS())
10803 return LowerToTLSEmulatedModel(GA, DAG);
10804
10805 if (Subtarget->isTargetDarwin())
10806 return LowerDarwinGlobalTLSAddress(Op, DAG);
10807 if (Subtarget->isTargetELF())
10808 return LowerELFGlobalTLSAddress(Op, DAG);
10809 if (Subtarget->isTargetWindows())
10810 return LowerWindowsGlobalTLSAddress(Op, DAG);
10811
10812 llvm_unreachable("Unexpected platform trying to use TLS");
10813}
10814
10815//===----------------------------------------------------------------------===//
10816// PtrAuthGlobalAddress lowering
10817//
10818// We have 3 lowering alternatives to choose from:
10819// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10820// If the GV doesn't need a GOT load (i.e., is locally defined)
10821// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10822//
10823// - LOADgotPAC: similar to LOADgot, with added PAC.
10824// If the GV needs a GOT load, materialize the pointer using the usual
10825// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10826// section is assumed to be read-only (for example, via relro mechanism). See
10827// LowerMOVaddrPAC.
10828//
10829// - LOADauthptrstatic: similar to LOADgot, but use a
10830// special stub slot instead of a GOT slot.
10831// Load a signed pointer for symbol 'sym' from a stub slot named
10832// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10833// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10834// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10835//
10836 // All 3 are pseudos that are expanded late into longer sequences: this lets
10837 // us provide integrity guarantees on the to-be-signed intermediate values.
10838//
10839// LOADauthptrstatic is undesirable because it requires a large section filled
10840// with often similarly-signed pointers, making it a good harvesting target.
10841// Thus, it's only used for ptrauth references to extern_weak to avoid null
10842// checks.
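// Illustrative expansion (not from the original source): for a locally
// defined symbol signed with key IA and a small constant discriminator,
// MOVaddrPAC expands to roughly
//   adrp x16, sym
//   add x16, x16, :lo12:sym
//   mov x17, #disc
//   pacia x16, x17
// while the GOT-based variants first load the raw pointer and then sign it.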
10843
10844 static SDValue LowerPtrAuthGlobalAddressStatically(
10845 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10846 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10847 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10848 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10849
10850 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10851 // offset alone as a pointer if the symbol wasn't available, which would
10852 // probably break null checks in users. Ptrauth complicates things further:
10853 // error out.
10854 if (TGN->getOffset() != 0)
10856 "unsupported non-zero offset in weak ptrauth global reference");
10857
10858 if (!isNullConstant(AddrDiscriminator))
10859 report_fatal_error("unsupported weak addr-div ptrauth global");
10860
10861 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10862 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10863 {TGA, Key, Discriminator}),
10864 0);
10865}
10866
10867SDValue
10868AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10869 SelectionDAG &DAG) const {
10870 SDValue Ptr = Op.getOperand(0);
10871 uint64_t KeyC = Op.getConstantOperandVal(1);
10872 SDValue AddrDiscriminator = Op.getOperand(2);
10873 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10874 EVT VT = Op.getValueType();
10875 SDLoc DL(Op);
10876
10877 if (KeyC > AArch64PACKey::LAST)
10878 report_fatal_error("key in ptrauth global out of range [0, " +
10879 Twine((int)AArch64PACKey::LAST) + "]");
10880
10881 // Blend only works if the integer discriminator is 16 bits wide.
10882 if (!isUInt<16>(DiscriminatorC))
10884 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10885
10886 // Choosing between 3 lowering alternatives is target-specific.
10887 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10888 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10889
10890 int64_t PtrOffsetC = 0;
10891 if (Ptr.getOpcode() == ISD::ADD) {
10892 PtrOffsetC = Ptr.getConstantOperandVal(1);
10893 Ptr = Ptr.getOperand(0);
10894 }
10895 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10896 const GlobalValue *PtrGV = PtrN->getGlobal();
10897
10898 // Classify the reference to determine whether it needs a GOT load.
10899 const unsigned OpFlags =
10900 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10901 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10902 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10903 "unsupported non-GOT op flags on ptrauth global reference");
10904
10905 // Fold any offset into the GV; our pseudos expect it there.
10906 PtrOffsetC += PtrN->getOffset();
10907 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10908 /*TargetFlags=*/0);
10909 assert(PtrN->getTargetFlags() == 0 &&
10910 "unsupported target flags on ptrauth global");
10911
10912 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10913 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10914 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10915 ? AddrDiscriminator
10916 : DAG.getRegister(AArch64::XZR, MVT::i64);
10917
10918 // No GOT load needed -> MOVaddrPAC
10919 if (!NeedsGOTLoad) {
10920 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10921 return SDValue(
10922 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10923 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10924 0);
10925 }
10926
10927 // GOT load -> LOADgotPAC
10928 // Note that we disallow extern_weak refs to avoid null checks later.
10929 if (!PtrGV->hasExternalWeakLinkage())
10930 return SDValue(
10931 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10932 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10933 0);
10934
10935 // extern_weak ref -> LOADauthptrstatic
10936 return LowerPtrAuthGlobalAddressStatically(
10937 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10938 DAG);
10939}
10940
10941// Looks through \param Val to determine the bit that can be used to
10942// check the sign of the value. It returns the unextended value and
10943// the sign bit position.
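// Illustrative example (not from the original source): for
// (sign_extend_inreg i64 %x, i8) this returns {%x, 7}; for a plain i32
// value it returns {value, 31}.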
10944std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10945 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10946 return {Val.getOperand(0),
10947 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10948 1};
10949
10950 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10951 return {Val.getOperand(0),
10952 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10953
10954 return {Val, Val.getValueSizeInBits() - 1};
10955}
10956
10957SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10958 SDValue Chain = Op.getOperand(0);
10959 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10960 SDValue LHS = Op.getOperand(2);
10961 SDValue RHS = Op.getOperand(3);
10962 SDValue Dest = Op.getOperand(4);
10963 SDLoc DL(Op);
10964
10965 MachineFunction &MF = DAG.getMachineFunction();
10966 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10967 // will not be produced, as they are conditional branch instructions that do
10968 // not set flags.
10969 bool ProduceNonFlagSettingCondBr =
10970 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10971
10972 // Handle f128 first, since lowering it will result in comparing the return
10973 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10974 // is expecting to deal with.
10975 if (LHS.getValueType() == MVT::f128) {
10976 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
10977
10978 // If softenSetCCOperands returned a scalar, we need to compare the result
10979 // against zero to select between true and false values.
10980 if (!RHS.getNode()) {
10981 RHS = DAG.getConstant(0, DL, LHS.getValueType());
10982 CC = ISD::SETNE;
10983 }
10984 }
10985
10986 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10987 // instruction.
10988 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10989 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10990 // Only lower legal XALUO ops.
10991 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10992 return SDValue();
10993
10994 // The actual operation with overflow check.
10995 AArch64CC::CondCode OFCC;
10996 SDValue Value, Overflow;
10997 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10998
10999 if (CC == ISD::SETNE)
11000 OFCC = getInvertedCondCode(OFCC);
11001 SDValue CCVal = getCondCode(DAG, OFCC);
11002
11003 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11004 Overflow);
11005 }
11006
11007 if (LHS.getValueType().isInteger()) {
11008 assert((LHS.getValueType() == RHS.getValueType()) &&
11009 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11010
11011 // If the RHS of the comparison is zero, we can potentially fold this
11012 // to a specialized branch.
11013 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11014 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
11015 if (CC == ISD::SETEQ) {
11016 // See if we can use a TBZ to fold in an AND as well.
11017 // TBZ has a smaller branch displacement than CBZ. If the offset is
11018 // out of bounds, a late MI-layer pass rewrites branches.
11019 // 403.gcc is an example that hits this case.
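// Illustrative example (not from the original source):
//   (brcond (seteq (and x, 8), 0), dest)
// becomes "tbz x, #3, dest" through the fold below.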
11020 if (LHS.getOpcode() == ISD::AND &&
11021 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11022 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11023 SDValue Test = LHS.getOperand(0);
11024 uint64_t Mask = LHS.getConstantOperandVal(1);
11025 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
11026 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11027 Dest);
11028 }
11029
11030 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11031 } else if (CC == ISD::SETNE) {
11032 // See if we can use a TBZ to fold in an AND as well.
11033 // TBZ has a smaller branch displacement than CBZ. If the offset is
11034 // out of bounds, a late MI-layer pass rewrites branches.
11035 // 403.gcc is an example that hits this case.
11036 if (LHS.getOpcode() == ISD::AND &&
11037 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11038 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11039 SDValue Test = LHS.getOperand(0);
11040 uint64_t Mask = LHS.getConstantOperandVal(1);
11041 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
11042 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11043 Dest);
11044 }
11045
11046 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11047 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
11048 // Don't combine AND since emitComparison converts the AND to an ANDS
11049 // (a.k.a. TST) and the test in the test bit and branch instruction
11050 // becomes redundant. This would also increase register pressure.
11051 uint64_t SignBitPos;
11052 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11053 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11054 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11055 }
11056 }
11057 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11058 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11059 // Don't combine AND since emitComparison converts the AND to an ANDS
11060 // (a.k.a. TST) and the test in the test bit and branch instruction
11061 // becomes redundant. This would also increase register pressure.
11062 uint64_t SignBitPos;
11063 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11064 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11065 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11066 }
11067
11068 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11069 // larger branch displacement, but we do prefer CB over cmp + br.
11070 if (Subtarget->hasCMPBR() &&
11072 ProduceNonFlagSettingCondBr) {
11073 SDValue Cond =
11075 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11076 Dest);
11077 }
11078
11079 SDValue CCVal;
11080 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11081 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11082 Cmp);
11083 }
11084
11085 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11086 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11087
11088 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11089 // clean. Some of them require two branches to implement.
11090 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11091 AArch64CC::CondCode CC1, CC2;
11092 changeFPCCToAArch64CC(CC, CC1, CC2);
11093 SDValue CC1Val = getCondCode(DAG, CC1);
11094 SDValue BR1 =
11095 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
11096 if (CC2 != AArch64CC::AL) {
11097 SDValue CC2Val = getCondCode(DAG, CC2);
11098 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11099 Cmp);
11100 }
11101
11102 return BR1;
11103}
11104
11105SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11106 SelectionDAG &DAG) const {
11107 if (!Subtarget->isNeonAvailable() &&
11108 !Subtarget->useSVEForFixedLengthVectors())
11109 return SDValue();
11110
11111 EVT VT = Op.getValueType();
11112 EVT IntVT = VT.changeTypeToInteger();
11113 SDLoc DL(Op);
11114
11115 SDValue In1 = Op.getOperand(0);
11116 SDValue In2 = Op.getOperand(1);
11117 EVT SrcVT = In2.getValueType();
11118
11119 if (!SrcVT.bitsEq(VT))
11120 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11121
11122 if (VT.isScalableVector())
11123 IntVT =
11124 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
11125
11126 if (VT.isFixedLengthVector() &&
11127 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11128 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11129
11130 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11131 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11132
11133 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11134 return convertFromScalableVector(DAG, VT, Res);
11135 }
11136
11137 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11138 // an SVE FCOPYSIGN.
11139 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11140 Subtarget->isSVEorStreamingSVEAvailable()) {
11141 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11142 return SDValue();
11143 EVT SVT = getPackedSVEVectorVT(VT);
11144
11145 SDValue Ins1 =
11146 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
11147 DAG.getConstant(0, DL, MVT::i64));
11148 SDValue Ins2 =
11149 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
11150 DAG.getConstant(0, DL, MVT::i64));
11151 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11152 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
11153 DAG.getConstant(0, DL, MVT::i64));
11154 }
11155
11156 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11157 if (VT.isScalableVector())
11158 return getSVESafeBitCast(VT, Op, DAG);
11159
11160 return DAG.getBitcast(VT, Op);
11161 };
11162
11163 SDValue VecVal1, VecVal2;
11164 EVT VecVT;
11165 auto SetVecVal = [&](int Idx = -1) {
11166 if (!VT.isVector()) {
11167 VecVal1 =
11168 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
11169 VecVal2 =
11170 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
11171 } else {
11172 VecVal1 = BitCast(VecVT, In1, DAG);
11173 VecVal2 = BitCast(VecVT, In2, DAG);
11174 }
11175 };
11176 if (VT.isVector()) {
11177 VecVT = IntVT;
11178 SetVecVal();
11179 } else if (VT == MVT::f64) {
11180 VecVT = MVT::v2i64;
11181 SetVecVal(AArch64::dsub);
11182 } else if (VT == MVT::f32) {
11183 VecVT = MVT::v4i32;
11184 SetVecVal(AArch64::ssub);
11185 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11186 VecVT = MVT::v8i16;
11187 SetVecVal(AArch64::hsub);
11188 } else {
11189 llvm_unreachable("Invalid type for copysign!");
11190 }
11191
11192 unsigned BitWidth = In1.getScalarValueSizeInBits();
11193 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11194
11195 // We want to materialize a mask with every bit but the high bit set, but the
11196 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11197 // 64-bit elements. Instead, materialize all bits set and then negate that.
11198 if (VT == MVT::f64 || VT == MVT::v2f64) {
11199 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11200 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11201 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11202 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11203 }
11204
11205 SDValue BSP =
11206 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11207 if (VT == MVT::f16 || VT == MVT::bf16)
11208 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11209 if (VT == MVT::f32)
11210 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11211 if (VT == MVT::f64)
11212 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11213
11214 return BitCast(VT, BSP, DAG);
11215}
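// Illustrative sketch (not part of the original source): the bitwise select
// that the BSP node above performs, modelled on 64-bit scalar bit patterns.
// With the mask holding every bit except the sign bit, the result takes its
// magnitude bits from In1 and its sign bit from In2, which is FCOPYSIGN.
// Hypothetical helper, exposition only:
static inline uint64_t refCopySignBits64(uint64_t Mag, uint64_t Sign) {
  const uint64_t NotSignMask = ~(1ULL << 63); // all bits set except the sign bit
  return (Mag & NotSignMask) | (Sign & ~NotSignMask); // BSP(NotSignMask, Mag, Sign)
}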
11216
11217SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11218 SelectionDAG &DAG) const {
11219 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11220 Attribute::NoImplicitFloat))
11221 return SDValue();
11222
11223 EVT VT = Op.getValueType();
11224 if (VT.isScalableVector() ||
11225 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11226 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11227
11228 bool IsParity = Op.getOpcode() == ISD::PARITY;
11229 SDValue Val = Op.getOperand(0);
11230 SDLoc DL(Op);
11231
11232 // For i32, a general parity computation using EORs is more efficient than
11233 // going through floating point.
11234 if (VT == MVT::i32 && IsParity)
11235 return SDValue();
11236
11237 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11238 if (VT == MVT::i32 || VT == MVT::i64) {
11239 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11240 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11241 DAG.getUNDEF(ContainerVT), Val,
11242 DAG.getVectorIdxConstant(0, DL));
11243 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11244 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11245 DAG.getVectorIdxConstant(0, DL));
11246 if (IsParity)
11247 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11248 return Val;
11249 }
11250
11251 if (VT == MVT::i128) {
11252 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11253 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11254 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11255 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11256 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11257 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11258 if (IsParity)
11259 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11260 return Val;
11261 }
11262 }
11263
11264 if (!Subtarget->isNeonAvailable())
11265 return SDValue();
11266
11267 // There is no scalar CNT instruction for GPRs, but popcount can
11268 // be more efficiently lowered to the following sequence that uses
11269 // AdvSIMD registers/instructions as long as the copies to/from
11270 // the AdvSIMD registers are cheap.
11271 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11272 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11273 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11274 // FMOV X0, D0 // copy result back to integer reg
11275 if (VT == MVT::i32 || VT == MVT::i64) {
11276 if (VT == MVT::i32)
11277 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11278 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11279
11280 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11281 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11282 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11283 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11284 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11285 DAG.getConstant(0, DL, MVT::i64));
11286 if (IsParity)
11287 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11288 return AddV;
11289 } else if (VT == MVT::i128) {
11290 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11291
11292 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11293 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11294 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11295 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11296 DAG.getConstant(0, DL, MVT::i64));
11297 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11298 if (IsParity)
11299 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11300 return AddV;
11301 }
11302
11303 assert(!IsParity && "ISD::PARITY of vector types not supported");
11304
11305 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11306 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11307 "Unexpected type for custom ctpop lowering");
11308
11309 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11310 Val = DAG.getBitcast(VT8Bit, Val);
11311 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11312
11313 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11314 VT.getVectorNumElements() >= 2) {
11315 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11316 SDValue Zeros = DAG.getConstant(0, DL, DT);
11317 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11318
11319 if (VT == MVT::v2i64) {
11320 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11321 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11322 } else if (VT == MVT::v2i32) {
11323 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11324 } else if (VT == MVT::v4i32) {
11325 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11326 } else {
11327 llvm_unreachable("Unexpected type for custom ctpop lowering");
11328 }
11329
11330 return Val;
11331 }
11332
11333 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
11334 unsigned EltSize = 8;
11335 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11336 while (EltSize != VT.getScalarSizeInBits()) {
11337 EltSize *= 2;
11338 NumElts /= 2;
11339 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11340 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11341 }
11342
11343 return Val;
11344}
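// Illustrative sketch (not part of the original source): the scalar identities
// the CTPOP/PARITY lowering above relies on. The vector CNT/ADDV pair computes
// a population count, and parity is just its low bit, which is why the PARITY
// paths end with an AND against 1. Hypothetical helpers, exposition only:
static inline unsigned refPopCount64(uint64_t X) {
  unsigned Count = 0;
  for (; X; X &= X - 1) // clears the lowest set bit each iteration
    ++Count;
  return Count;
}
static inline unsigned refParity64(uint64_t X) {
  return refPopCount64(X) & 1; // matches the trailing AND-with-1 above
}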
11345
11346SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11347 EVT VT = Op.getValueType();
11348 assert(VT.isScalableVector() ||
11349 useSVEForFixedLengthVectorVT(
11350 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11351
11352 SDLoc DL(Op);
11353 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11354 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11355}
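// Illustrative sketch (not part of the original source): LowerCTTZ uses the
// identity cttz(x) == ctlz(bitreverse(x)); reversing the bits moves the lowest
// set bit to the highest position, so a leading-zero count of the reversed
// value counts the trailing zeros of the original. Reference model over a
// byte, with hypothetical helpers for exposition only:
static inline uint8_t refBitReverse8(uint8_t X) {
  uint8_t R = 0;
  for (int I = 0; I < 8; ++I)
    R = (uint8_t)((R << 1) | ((X >> I) & 1));
  return R;
}
static inline unsigned refCttz8(uint8_t X) {
  unsigned N = 0;
  for (uint8_t R = refBitReverse8(X); N < 8 && !((R >> (7 - N)) & 1); ++N)
    ; // counting leading zeros of the bit-reversed value
  return N; // 8 when X == 0
}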
11356
11357SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11358 SelectionDAG &DAG) const {
11359
11360 EVT VT = Op.getValueType();
11361 SDLoc DL(Op);
11362 unsigned Opcode = Op.getOpcode();
11363 ISD::CondCode CC;
11364 switch (Opcode) {
11365 default:
11366 llvm_unreachable("Wrong instruction");
11367 case ISD::SMAX:
11368 CC = ISD::SETGT;
11369 break;
11370 case ISD::SMIN:
11371 CC = ISD::SETLT;
11372 break;
11373 case ISD::UMAX:
11374 CC = ISD::SETUGT;
11375 break;
11376 case ISD::UMIN:
11377 CC = ISD::SETULT;
11378 break;
11379 }
11380
11381 // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
11382 // prefer using SVE if available.
11383 if (VT.isScalableVector() ||
11384 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
11385 switch (Opcode) {
11386 default:
11387 llvm_unreachable("Wrong instruction");
11388 case ISD::SMAX:
11389 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11390 case ISD::SMIN:
11391 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11392 case ISD::UMAX:
11393 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11394 case ISD::UMIN:
11395 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11396 }
11397 }
11398
11399 SDValue Op0 = Op.getOperand(0);
11400 SDValue Op1 = Op.getOperand(1);
11401 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11402 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11403}
11404
11405SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11406 SelectionDAG &DAG) const {
11407 EVT VT = Op.getValueType();
11408
11409 if (VT.isScalableVector() ||
11410 useSVEForFixedLengthVectorVT(
11411 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11412 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11413
11414 SDLoc DL(Op);
11415 SDValue REVB;
11416 MVT VST;
11417
11418 switch (VT.getSimpleVT().SimpleTy) {
11419 default:
11420 llvm_unreachable("Invalid type for bitreverse!");
11421
11422 case MVT::v2i32: {
11423 VST = MVT::v8i8;
11424 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11425
11426 break;
11427 }
11428
11429 case MVT::v4i32: {
11430 VST = MVT::v16i8;
11431 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11432
11433 break;
11434 }
11435
11436 case MVT::v1i64: {
11437 VST = MVT::v8i8;
11438 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11439
11440 break;
11441 }
11442
11443 case MVT::v2i64: {
11444 VST = MVT::v16i8;
11445 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11446
11447 break;
11448 }
11449 }
11450
11451 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11452 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11453}
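// Illustrative sketch (not part of the original source): the decomposition used
// above. A full bit reversal of a wide lane equals reversing the byte order of
// the lane (REV32/REV64) and then reversing the bits inside each byte (the
// v8i8/v16i8 BITREVERSE); the two steps commute. Reference model for one
// 32-bit lane, hypothetical helper for exposition only:
static inline uint32_t refBitReverse32(uint32_t X) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I)
    R = (R << 1) | ((X >> I) & 1);
  return R;
}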
11454
11455 // Check whether N forms a continuous comparison sequence, i.e. a chain of
11455 // ORs whose leaves are XORs.
11456static bool
11457isOrXorChain(SDValue N, unsigned &Num,
11458 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11459 if (Num == MaxXors)
11460 return false;
11461
11462 // Skip the one-use zext
11463 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11464 N = N->getOperand(0);
11465
11466 // The leaf node must be XOR
11467 if (N->getOpcode() == ISD::XOR) {
11468 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11469 Num++;
11470 return true;
11471 }
11472
11473 // All the non-leaf nodes must be OR.
11474 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11475 return false;
11476
11477 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11478 isOrXorChain(N->getOperand(1), Num, WorkList))
11479 return true;
11480 return false;
11481}
11482
11483 // Transform chains of ORs and XORs, which are usually produced by memcmp/bcmp
11483 // expansion.
11484 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
11485 SDValue LHS = N->getOperand(0);
11486 SDValue RHS = N->getOperand(1);
11487 SDLoc DL(N);
11488 EVT VT = N->getValueType(0);
11489 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11490
11491 // Only handle integer compares.
11492 if (N->getOpcode() != ISD::SETCC)
11493 return SDValue();
11494
11495 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11496 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11497 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11498 unsigned NumXors = 0;
11499 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11500 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11501 isOrXorChain(LHS, NumXors, WorkList)) {
11502 SDValue XOR0, XOR1;
11503 std::tie(XOR0, XOR1) = WorkList[0];
11504 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11505 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11506 for (unsigned I = 1; I < WorkList.size(); I++) {
11507 std::tie(XOR0, XOR1) = WorkList[I];
11508 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11509 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11510 }
11511
11512 // Exit early by inverting the condition, which helps reduce indentation.
11513 return Cmp;
11514 }
11515
11516 return SDValue();
11517}
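// Illustrative sketch (not part of the original source): the equivalence the
// OR/XOR chain combine above exploits. For a memcmp/bcmp-style equality test,
// (a0 ^ b0) | (a1 ^ b1) | ... is zero iff every pair of blocks is equal, so
// the chain can be re-expressed as a conjunction of per-block compares and
// later matched to cmp + ccmp. Hypothetical reference helper, exposition only:
static inline bool refBlocksEqual(const uint64_t *A, const uint64_t *B,
                                  unsigned NumBlocks) {
  uint64_t Acc = 0;
  for (unsigned I = 0; I < NumBlocks; ++I)
    Acc |= A[I] ^ B[I]; // one XOR leaf per block, OR-ed into the accumulator
  return Acc == 0;      // equivalent to: A[I] == B[I] for every I
}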
11518
11519SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11520
11521 if (Op.getValueType().isVector())
11522 return LowerVSETCC(Op, DAG);
11523
11524 bool IsStrict = Op->isStrictFPOpcode();
11525 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11526 unsigned OpNo = IsStrict ? 1 : 0;
11527 SDValue Chain;
11528 if (IsStrict)
11529 Chain = Op.getOperand(0);
11530 SDValue LHS = Op.getOperand(OpNo + 0);
11531 SDValue RHS = Op.getOperand(OpNo + 1);
11532 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11533 SDLoc DL(Op);
11534
11535 // We chose ZeroOrOneBooleanContents, so use zero and one.
11536 EVT VT = Op.getValueType();
11537 SDValue TVal = DAG.getConstant(1, DL, VT);
11538 SDValue FVal = DAG.getConstant(0, DL, VT);
11539
11540 // Handle f128 first, since one possible outcome is a normal integer
11541 // comparison which gets picked up by the next if statement.
11542 if (LHS.getValueType() == MVT::f128) {
11543 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11544 IsSignaling);
11545
11546 // If softenSetCCOperands returned a scalar, use it.
11547 if (!RHS.getNode()) {
11548 assert(LHS.getValueType() == Op.getValueType() &&
11549 "Unexpected setcc expansion!");
11550 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11551 }
11552 }
11553
11554 if (LHS.getValueType().isInteger()) {
11555
11556 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11557
11558 SDValue CCVal;
11559 SDValue Cmp = getAArch64Cmp(
11560 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11561
11562 // Note that we inverted the condition above, so we reverse the order of
11563 // the true and false operands here. This will allow the setcc to be
11564 // matched to a single CSINC instruction.
11565 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11566 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11567 }
11568
11569 // Now we know we're dealing with FP values.
11570 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11571 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11572
11573 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11574 // and do the comparison.
11575 SDValue Cmp;
11576 if (IsStrict)
11577 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11578 else
11579 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11580
11581 AArch64CC::CondCode CC1, CC2;
11582 changeFPCCToAArch64CC(CC, CC1, CC2);
11583 SDValue Res;
11584 if (CC2 == AArch64CC::AL) {
11585 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11586 CC2);
11587 SDValue CC1Val = getCondCode(DAG, CC1);
11588
11589 // Note that we inverted the condition above, so we reverse the order of
11590 // the true and false operands here. This will allow the setcc to be
11591 // matched to a single CSINC instruction.
11592 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11593 } else {
11594 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11595 // totally clean. Some of them require two CSELs to implement. As is in
11596 // this case, we emit the first CSEL and then emit a second using the output
11597 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11598
11599 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11600 SDValue CC1Val = getCondCode(DAG, CC1);
11601 SDValue CS1 =
11602 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11603
11604 SDValue CC2Val = getCondCode(DAG, CC2);
11605 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11606 }
11607 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11608}
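// Illustrative note (not part of the original source): the operands are given
// to the CSEL as (0, 1, inv(cc)) so that the node can be matched to the single
// instruction CSINC Wd, WZR, WZR, inv(cc), whose alias is CSET Wd, cc. An
// integer setcc therefore selects to, for example:
//   cmp  w0, w1
//   cset w0, lt        ; alias of csinc w0, wzr, wzr, ge
// (assembly shown only to illustrate the intended single-instruction match).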
11609
11610SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11611 SelectionDAG &DAG) const {
11612
11613 SDValue LHS = Op.getOperand(0);
11614 SDValue RHS = Op.getOperand(1);
11615 EVT VT = LHS.getValueType();
11616 if (VT != MVT::i32 && VT != MVT::i64)
11617 return SDValue();
11618
11619 SDLoc DL(Op);
11620 SDValue Carry = Op.getOperand(2);
11621 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11622 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11623 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11624 LHS, RHS, InvCarry);
11625
11626 EVT OpVT = Op.getValueType();
11627 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11628 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11629
11630 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11631 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11632 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
11633 // Inputs are swapped because the condition is inverted. This will allow
11634 // matching with a single CSINC instruction.
11635 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11636 Cmp.getValue(1));
11637}
11638
11639/// Emit vector comparison for floating-point values, producing a mask.
11640 static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11641 AArch64CC::CondCode CC, bool NoNans, EVT VT,
11642 const SDLoc &DL, SelectionDAG &DAG) {
11643 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11644 "function only supposed to emit natural comparisons");
11645
11646 switch (CC) {
11647 default:
11648 return SDValue();
11649 case AArch64CC::NE: {
11650 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11651 // Use vector semantics for the inversion to potentially save a copy between
11652 // SIMD and regular registers.
11653 if (!LHS.getValueType().isVector()) {
11654 EVT VecVT =
11655 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11656 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11657 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
11658 DAG.getUNDEF(VecVT), Fcmeq, Zero);
11659 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
11660 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
11661 }
11662 return DAG.getNOT(DL, Fcmeq, VT);
11663 }
11664 case AArch64CC::EQ:
11665 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11666 case AArch64CC::GE:
11667 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
11668 case AArch64CC::GT:
11669 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
11670 case AArch64CC::LE:
11671 if (!NoNans)
11672 return SDValue();
11673 // If we ignore NaNs then we can use the LS implementation.
11674 [[fallthrough]];
11675 case AArch64CC::LS:
11676 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
11677 case AArch64CC::LT:
11678 if (!NoNans)
11679 return SDValue();
11680 // If we ignore NaNs then we can use the MI implementation.
11681 [[fallthrough]];
11682 case AArch64CC::MI:
11683 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
11684 }
11685}
11686
11687/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11688/// values are scalars, try to emit a mask generating vector instruction.
11689 static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
11690 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11691 const SDLoc &DL, SelectionDAG &DAG) {
11692 assert(!LHS.getValueType().isVector());
11693 assert(!RHS.getValueType().isVector());
11694
11695 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
11696 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
11697 if (!CTVal || !CFVal)
11698 return {};
11699 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11700 !(CTVal->isZero() && CFVal->isAllOnes()))
11701 return {};
11702
11703 if (CTVal->isZero())
11704 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11705
11706 EVT VT = TVal.getValueType();
11707 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11708 return {};
11709
11710 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
11711 bool OneNaN = false;
11712 if (LHS == RHS) {
11713 OneNaN = true;
11714 } else if (DAG.isKnownNeverNaN(RHS)) {
11715 OneNaN = true;
11716 RHS = LHS;
11717 } else if (DAG.isKnownNeverNaN(LHS)) {
11718 OneNaN = true;
11719 LHS = RHS;
11720 }
11721 if (OneNaN)
11722 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11723 }
11724
11727 bool ShouldInvert = false;
11728 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
11729 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
11730 SDValue Cmp2;
11731 if (CC2 != AArch64CC::AL) {
11732 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
11733 if (!Cmp2)
11734 return {};
11735 }
11736 if (!Cmp2 && !ShouldInvert)
11737 return Cmp;
11738
11739 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11740 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11741 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
11742 Zero);
11743 if (Cmp2) {
11744 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
11745 Cmp2, Zero);
11746 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
11747 }
11748 if (ShouldInvert)
11749 Cmp = DAG.getNOT(DL, Cmp, VecVT);
11750 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
11751 return Cmp;
11752}
11753
11754SDValue AArch64TargetLowering::LowerSELECT_CC(
11755 ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
11756 iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
11757 const SDLoc &DL, SelectionDAG &DAG) const {
11758 // Handle f128 first, because it will result in a comparison of some RTLIB
11759 // call result against zero.
11760 if (LHS.getValueType() == MVT::f128) {
11761 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11762
11763 // If softenSetCCOperands returned a scalar, we need to compare the result
11764 // against zero to select between true and false values.
11765 if (!RHS.getNode()) {
11766 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11767 CC = ISD::SETNE;
11768 }
11769 }
11770
11771 // Also handle f16, for which we need to do a f32 comparison.
11772 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11773 LHS.getValueType() == MVT::bf16) {
11774 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
11775 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
11776 }
11777
11778 // Next, handle integers.
11779 if (LHS.getValueType().isInteger()) {
11780 assert((LHS.getValueType() == RHS.getValueType()) &&
11781 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11782
11783 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11784 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11785 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11786
11787 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11788 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11789 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11790 // Both require less instructions than compare and conditional select.
11791 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11792 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11793 LHS.getValueType() == RHS.getValueType()) {
11794 EVT VT = LHS.getValueType();
11795 SDValue Shift =
11796 DAG.getNode(ISD::SRA, DL, VT, LHS,
11797 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
11798
11799 if (CC == ISD::SETGT)
11800 Shift = DAG.getNOT(DL, Shift, VT);
11801
11802 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
11803 }
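    // Worked example (illustrative, not from the original source): for i32 and
    // lhs = -5, (lhs >> 31) is all ones, so lhs & ~(lhs >> 31) == 0 == smax(-5, 0)
    // and lhs & (lhs >> 31) == -5 == smin(-5, 0); for lhs = 7 the shift is zero
    // and the two expressions give 7 and 0 respectively.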
11804
11805 // Check for sign bit test patterns that can use TST optimization.
11806 // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
11807 // -> TST %operand, sign_bit; CSEL
11808 // (SELECT_CC setlt, sign_extend, 0, tval, fval)
11809 // -> TST %operand, sign_bit; CSEL
11810 if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
11811 (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
11812 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11813
11814 uint64_t SignBitPos;
11815 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11816 EVT TestVT = LHS.getValueType();
11817 SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
11818 SDValue TST =
11819 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
11820 LHS, SignBitConst);
11821
11822 SDValue Flags = TST.getValue(1);
11823 return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
11824 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
11825 }
11826
11827 // Canonicalise absolute difference patterns:
11828 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
11829 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
11830 //
11831 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
11832 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
11833 // The second forms can be matched into subs+cneg.
11834 // NOTE: Drop poison generating flags from the negated operand to avoid
11835 // inadvertently propagating poison after the canonicalisation.
11836 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
11837 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
11838 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
11840 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
11841 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
11842 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
11844 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
11845 }
11846 }
11847
11848 unsigned Opcode = AArch64ISD::CSEL;
11849
11850 // If both the TVal and the FVal are constants, see if we can swap them in
11851 // order to form a CSINV or CSINC out of them.
11852 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11853 std::swap(TVal, FVal);
11854 std::swap(CTVal, CFVal);
11855 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11856 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11857 std::swap(TVal, FVal);
11858 std::swap(CTVal, CFVal);
11859 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11860 } else if (TVal.getOpcode() == ISD::XOR) {
11861 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11862 // with a CSINV rather than a CSEL.
11863 if (isAllOnesConstant(TVal.getOperand(1))) {
11864 std::swap(TVal, FVal);
11865 std::swap(CTVal, CFVal);
11866 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11867 }
11868 } else if (TVal.getOpcode() == ISD::SUB) {
11869 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11870 // that we can match with a CSNEG rather than a CSEL.
11871 if (isNullConstant(TVal.getOperand(0))) {
11872 std::swap(TVal, FVal);
11873 std::swap(CTVal, CFVal);
11874 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11875 }
11876 } else if (CTVal && CFVal) {
11877 const int64_t TrueVal = CTVal->getSExtValue();
11878 const int64_t FalseVal = CFVal->getSExtValue();
11879 bool Swap = false;
11880
11881 // If both TVal and FVal are constants, see if FVal is the
11882 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11883 // instead of a CSEL in that case.
11884 if (TrueVal == ~FalseVal) {
11885 Opcode = AArch64ISD::CSINV;
11886 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11887 TrueVal == -FalseVal) {
11888 Opcode = AArch64ISD::CSNEG;
11889 } else if (TVal.getValueType() == MVT::i32) {
11890 // If our operands are only 32-bit wide, make sure we use 32-bit
11891 // arithmetic for the check whether we can use CSINC. This ensures that
11892 // the addition in the check will wrap around properly in case there is
11893 // an overflow (which would not be the case if we do the check with
11894 // 64-bit arithmetic).
11895 const uint32_t TrueVal32 = CTVal->getZExtValue();
11896 const uint32_t FalseVal32 = CFVal->getZExtValue();
11897
11898 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11899 Opcode = AArch64ISD::CSINC;
11900
11901 if (TrueVal32 > FalseVal32) {
11902 Swap = true;
11903 }
11904 }
11905 } else {
11906 // 64-bit check whether we can use CSINC.
11907 const uint64_t TrueVal64 = TrueVal;
11908 const uint64_t FalseVal64 = FalseVal;
11909
11910 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11911 Opcode = AArch64ISD::CSINC;
11912
11913 if (TrueVal > FalseVal) {
11914 Swap = true;
11915 }
11916 }
11917 }
11918
11919 // Swap TVal and FVal if necessary.
11920 if (Swap) {
11921 std::swap(TVal, FVal);
11922 std::swap(CTVal, CFVal);
11923 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11924 }
11925
11926 if (Opcode != AArch64ISD::CSEL) {
11927 // Drop FVal since we can get its value by simply inverting/negating
11928 // TVal.
11929 FVal = TVal;
11930 }
11931 }
11932
11933 // Avoid materializing a constant when possible by reusing a known value in
11934 // a register. However, don't perform this optimization if the known value
11935 // is one, zero or negative one in the case of a CSEL. We can always
11936 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11937 // FVal, respectively.
11938 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11939 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11940 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11941 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11942 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11943 // "a != C ? x : a" to avoid materializing C.
11944 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11945 TVal = LHS;
11946 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11947 FVal = LHS;
11948 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11949 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11950 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11951 // avoid materializing C.
11952 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11953 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11954 Opcode = AArch64ISD::CSINV;
11955 TVal = LHS;
11956 FVal = DAG.getConstant(0, DL, FVal.getValueType());
11957 }
11958 }
11959
11960 SDValue CCVal;
11961 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11962 EVT VT = TVal.getValueType();
11963 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
11964 }
11965
11966 // Now we know we're dealing with FP values.
11967 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11968 LHS.getValueType() == MVT::f64);
11969 assert(LHS.getValueType() == RHS.getValueType());
11970 EVT VT = TVal.getValueType();
11971
11972 // If the purpose of the comparison is to select between all ones
11973 // or all zeros, try to use a vector comparison because the operands are
11974 // already stored in SIMD registers.
11975 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
11976 switch (U->getOpcode()) {
11977 default:
11978 return false;
11981 case AArch64ISD::DUP:
11982 return true;
11983 }
11984 })) {
11985 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
11986 SDValue VectorCmp =
11987 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11988 if (VectorCmp)
11989 return VectorCmp;
11990 }
11991
11992 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11993
11994 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11995 // clean. Some of them require two CSELs to implement.
11996 AArch64CC::CondCode CC1, CC2;
11997 changeFPCCToAArch64CC(CC, CC1, CC2);
11998
11999 if (Flags.hasNoSignedZeros()) {
12000 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
12001 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
12002 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
12003 if (RHSVal && RHSVal->isZero()) {
12004 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
12005 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
12006
12007 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
12008 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
12009 TVal = LHS;
12010 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
12011 CFVal && CFVal->isZero() &&
12012 FVal.getValueType() == LHS.getValueType())
12013 FVal = LHS;
12014 }
12015 }
12016
12017 // Emit first, and possibly only, CSEL.
12018 SDValue CC1Val = getCondCode(DAG, CC1);
12019 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
12020
12021 // If we need a second CSEL, emit it, using the output of the first as the
12022 // RHS. We're effectively OR'ing the two CC's together.
12023 if (CC2 != AArch64CC::AL) {
12024 SDValue CC2Val = getCondCode(DAG, CC2);
12025 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12026 }
12027
12028 // Otherwise, return the output of the first CSEL.
12029 return CS1;
12030}
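// Illustrative sketch (not part of the original source): the constant
// relationships checked in the integer path above when a CSEL of two
// immediates can be strengthened to CSINV, CSNEG or CSINC. The real code also
// handles the 32-bit wrap-around case separately; this hypothetical helper
// only mirrors the 64-bit checks, for exposition:
static inline bool refFoldsToSingleCondSelect(int64_t TrueVal, int64_t FalseVal) {
  return TrueVal == ~FalseVal ||                         // CSINV: bitwise NOT
         (FalseVal > std::numeric_limits<int64_t>::min() &&
          TrueVal == -FalseVal) ||                       // CSNEG: negation
         (uint64_t)TrueVal == (uint64_t)FalseVal + 1 ||  // CSINC: differ by one
         (uint64_t)TrueVal + 1 == (uint64_t)FalseVal;
}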
12031
12032SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
12033 SelectionDAG &DAG) const {
12034 EVT Ty = Op.getValueType();
12035 auto Idx = Op.getConstantOperandAPInt(2);
12036 int64_t IdxVal = Idx.getSExtValue();
12037 assert(Ty.isScalableVector() &&
12038 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
12039
12040 // We can use the splice instruction for certain index values where we are
12041 // able to efficiently generate the correct predicate. The index will be
12042 // inverted and used directly as the input to the ptrue instruction, i.e.
12043 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
12044 // splice predicate. However, we can only do this if we can guarantee that
12045 // there are enough elements in the vector, hence we check the index <= min
12046 // number of elements.
12047 std::optional<unsigned> PredPattern;
12048 if (Ty.isScalableVector() && IdxVal < 0 &&
12049 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
12050 std::nullopt) {
12051 SDLoc DL(Op);
12052
12053 // Create a predicate where all but the last -IdxVal elements are false.
12054 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
12055 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
12056 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
12057
12058 // Now splice the two inputs together using the predicate.
12059 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
12060 Op.getOperand(1));
12061 }
12062
12063 // We can select to an EXT instruction when indexing the first 256 bytes.
12064 unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
12065 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
12066 return Op;
12067
12068 return SDValue();
12069}
12070
12071SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12072 SelectionDAG &DAG) const {
12073 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12074 SDValue LHS = Op.getOperand(0);
12075 SDValue RHS = Op.getOperand(1);
12076 SDValue TVal = Op.getOperand(2);
12077 SDValue FVal = Op.getOperand(3);
12078 SDNodeFlags Flags = Op->getFlags();
12079 SDLoc DL(Op);
12080 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12081}
12082
12083SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
12084 SelectionDAG &DAG) const {
12085 SDValue CCVal = Op->getOperand(0);
12086 SDValue TVal = Op->getOperand(1);
12087 SDValue FVal = Op->getOperand(2);
12088 SDLoc DL(Op);
12089
12090 EVT Ty = Op.getValueType();
12091 if (Ty == MVT::aarch64svcount) {
12092 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12093 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12094 SDValue Sel =
12095 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12096 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12097 }
12098
12099 if (Ty.isScalableVector()) {
12100 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12101 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12102 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12103 }
12104
12105 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12106 // FIXME: Ideally this would be the same as above using i1 types, however
12107 // for the moment we can't deal with fixed i1 vector types properly, so
12108 // instead extend the predicate to a result type sized integer vector.
12109 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12110 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12111 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12112 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12113 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12114 }
12115
12116 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12117 // instruction.
12118 if (ISD::isOverflowIntrOpRes(CCVal)) {
12119 // Only lower legal XALUO ops.
12120 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12121 return SDValue();
12122
12123 AArch64CC::CondCode OFCC;
12124 SDValue Value, Overflow;
12125 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
12126 SDValue CCVal = getCondCode(DAG, OFCC);
12127
12128 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12129 CCVal, Overflow);
12130 }
12131
12132 // Lower it the same way as we would lower a SELECT_CC node.
12133 ISD::CondCode CC;
12134 SDValue LHS, RHS;
12135 if (CCVal.getOpcode() == ISD::SETCC) {
12136 LHS = CCVal.getOperand(0);
12137 RHS = CCVal.getOperand(1);
12138 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12139 } else {
12140 LHS = CCVal;
12141 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12142 CC = ISD::SETNE;
12143 }
12144
12145 // If we are lowering an f16 or bf16 and we do not have full fp16 support,
12146 // convert to an f32 in order to use FCSELSrrr.
12147 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12148 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12149 DAG.getUNDEF(MVT::f32), TVal);
12150 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12151 DAG.getUNDEF(MVT::f32), FVal);
12152 }
12153
12154 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12155 Op->getFlags(), DL, DAG);
12156
12157 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12158 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12159 }
12160
12161 return Res;
12162}
12163
12164SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12165 SelectionDAG &DAG) const {
12166 // Jump table entries as PC relative offsets. No additional tweaking
12167 // is necessary here. Just get the address of the jump table.
12168 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12169
12170 CodeModel::Model CM = getTargetMachine().getCodeModel();
12171 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
12172 !Subtarget->isTargetMachO())
12173 return getAddrLarge(JT, DAG);
12174 if (CM == CodeModel::Tiny)
12175 return getAddrTiny(JT, DAG);
12176 return getAddr(JT, DAG);
12177}
12178
12179SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12180 SelectionDAG &DAG) const {
12181 // Jump table entries as PC relative offsets. No additional tweaking
12182 // is necessary here. Just get the address of the jump table.
12183 SDLoc DL(Op);
12184 SDValue JT = Op.getOperand(1);
12185 SDValue Entry = Op.getOperand(2);
12186 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12187
12188 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12189 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12190
12191 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12192 // sequence later, to guarantee the integrity of the intermediate values.
12193 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
12194 "aarch64-jump-table-hardening")) {
12195 CodeModel::Model CM = getTargetMachine().getCodeModel();
12196 if (Subtarget->isTargetMachO()) {
12197 if (CM != CodeModel::Small && CM != CodeModel::Large)
12198 report_fatal_error("Unsupported code-model for hardened jump-table");
12199 } else {
12200 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12201 assert(Subtarget->isTargetELF() &&
12202 "jump table hardening only supported on MachO/ELF");
12203 if (CM != CodeModel::Small)
12204 report_fatal_error("Unsupported code-model for hardened jump-table");
12205 }
12206
12207 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12208 Entry, SDValue());
12209 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12210 DAG.getTargetJumpTable(JTI, MVT::i32),
12211 X16Copy.getValue(0), X16Copy.getValue(1));
12212 return SDValue(B, 0);
12213 }
12214
12215 SDNode *Dest =
12216 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12217 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12218 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12219 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12220}
12221
12222SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12223 SDValue Chain = Op.getOperand(0);
12224 SDValue Dest = Op.getOperand(1);
12225
12226 // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
12227 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12228 if (Dest->isMachineOpcode() &&
12229 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12230 return SDValue();
12231
12232 const MachineFunction &MF = DAG.getMachineFunction();
12233 std::optional<uint16_t> BADisc =
12234 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12235 if (!BADisc)
12236 return SDValue();
12237
12238 SDLoc DL(Op);
12239
12240 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12241 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12242 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12243
12244 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12245 {Dest, Key, Disc, AddrDisc, Chain});
12246 return SDValue(BrA, 0);
12247}
12248
12249SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12250 SelectionDAG &DAG) const {
12251 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12252 CodeModel::Model CM = getTargetMachine().getCodeModel();
12253 if (CM == CodeModel::Large) {
12254 // Use the GOT for the large code model on iOS.
12255 if (Subtarget->isTargetMachO()) {
12256 return getGOT(CP, DAG);
12257 }
12258 if (!getTargetMachine().isPositionIndependent())
12259 return getAddrLarge(CP, DAG);
12260 } else if (CM == CodeModel::Tiny) {
12261 return getAddrTiny(CP, DAG);
12262 }
12263 return getAddr(CP, DAG);
12264}
12265
12266SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12267 SelectionDAG &DAG) const {
12268 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12269 const BlockAddress *BA = BAN->getBlockAddress();
12270
12271 if (std::optional<uint16_t> BADisc =
12272 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12273 *BA->getFunction())) {
12274 SDLoc DL(Op);
12275
12276 // This isn't cheap, but BRIND is rare.
12277 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12278
12279 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12280
12281 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12282 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12283
12284 SDNode *MOV =
12285 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12286 {TargetBA, Key, AddrDisc, Disc});
12287 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12288 SDValue(MOV, 1));
12289 }
12290
12291 CodeModel::Model CM = getTargetMachine().getCodeModel();
12292 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12293 if (!getTargetMachine().isPositionIndependent())
12294 return getAddrLarge(BAN, DAG);
12295 } else if (CM == CodeModel::Tiny) {
12296 return getAddrTiny(BAN, DAG);
12297 }
12298 return getAddr(BAN, DAG);
12299}
12300
12301SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12302 SelectionDAG &DAG) const {
12303 AArch64FunctionInfo *FuncInfo =
12304 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12305
12306 SDLoc DL(Op);
12307 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12308 getPointerTy(DAG.getDataLayout()));
12309 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12310 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12311 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12312 MachinePointerInfo(SV));
12313}
12314
12315SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12316 SelectionDAG &DAG) const {
12317 MachineFunction &MF = DAG.getMachineFunction();
12318 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12319
12320 SDLoc DL(Op);
12321 SDValue FR;
12322 if (Subtarget->isWindowsArm64EC()) {
12323 // With the Arm64EC ABI, we compute the address of the varargs save area
12324 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12325 // but calls from an entry thunk can pass in a different address.
12326 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12327 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12328 uint64_t StackOffset;
12329 if (FuncInfo->getVarArgsGPRSize() > 0)
12330 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12331 else
12332 StackOffset = FuncInfo->getVarArgsStackOffset();
12333 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12334 DAG.getConstant(StackOffset, DL, MVT::i64));
12335 } else {
12336 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12337 ? FuncInfo->getVarArgsGPRIndex()
12338 : FuncInfo->getVarArgsStackIndex(),
12339 getPointerTy(DAG.getDataLayout()));
12340 }
12341 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12342 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12343 MachinePointerInfo(SV));
12344}
12345
12346SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12347 SelectionDAG &DAG) const {
12348 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12349 // Standard, section B.3.
12350 MachineFunction &MF = DAG.getMachineFunction();
12351 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12352 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12353 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12354 auto PtrVT = getPointerTy(DAG.getDataLayout());
12355 SDLoc DL(Op);
12356
12357 SDValue Chain = Op.getOperand(0);
12358 SDValue VAList = Op.getOperand(1);
12359 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12360 SmallVector<SDValue, 8> MemOps;
12361
12362 // void *__stack at offset 0
12363 unsigned Offset = 0;
12364 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12365 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12366 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12367 MachinePointerInfo(SV), Align(PtrSize)));
12368
12369 // void *__gr_top at offset 8 (4 on ILP32)
12370 Offset += PtrSize;
12371 int GPRSize = FuncInfo->getVarArgsGPRSize();
12372 if (GPRSize > 0) {
12373 SDValue GRTop, GRTopAddr;
12374
12375 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12376 DAG.getConstant(Offset, DL, PtrVT));
12377
12378 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12379 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12380 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12381 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12382
12383 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12384 MachinePointerInfo(SV, Offset),
12385 Align(PtrSize)));
12386 }
12387
12388 // void *__vr_top at offset 16 (8 on ILP32)
12389 Offset += PtrSize;
12390 int FPRSize = FuncInfo->getVarArgsFPRSize();
12391 if (FPRSize > 0) {
12392 SDValue VRTop, VRTopAddr;
12393 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12394 DAG.getConstant(Offset, DL, PtrVT));
12395
12396 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12397 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12398 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12399 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12400
12401 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12402 MachinePointerInfo(SV, Offset),
12403 Align(PtrSize)));
12404 }
12405
12406 // int __gr_offs at offset 24 (12 on ILP32)
12407 Offset += PtrSize;
12408 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12409 DAG.getConstant(Offset, DL, PtrVT));
12410 MemOps.push_back(
12411 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12412 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12413
12414 // int __vr_offs at offset 28 (16 on ILP32)
12415 Offset += 4;
12416 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12417 DAG.getConstant(Offset, DL, PtrVT));
12418 MemOps.push_back(
12419 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12420 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12421
12422 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12423}
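// Illustrative reference (not part of the original source): the va_list record
// that the stores built above populate, following AAPCS64 section B.3. The
// parenthesised offsets are the ILP32 variant written by this function.
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4)
//     void *__vr_top;  // offset 16 (8)
//     int   __gr_offs; // offset 24 (12)
//     int   __vr_offs; // offset 28 (16)
//   };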
12424
12425SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12426 SelectionDAG &DAG) const {
12427 MachineFunction &MF = DAG.getMachineFunction();
12428 Function &F = MF.getFunction();
12429
12430 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12431 return LowerWin64_VASTART(Op, DAG);
12432 else if (Subtarget->isTargetDarwin())
12433 return LowerDarwin_VASTART(Op, DAG);
12434 else
12435 return LowerAAPCS_VASTART(Op, DAG);
12436}
12437
12438SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12439 SelectionDAG &DAG) const {
12440 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
12441 // pointer.
12442 SDLoc DL(Op);
12443 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12444 unsigned VaListSize =
12445 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12446 ? PtrSize
12447 : Subtarget->isTargetILP32() ? 20 : 32;
12448 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12449 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12450
12451 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12452 DAG.getConstant(VaListSize, DL, MVT::i32),
12453 Align(PtrSize), false, false, /*CI=*/nullptr,
12454 std::nullopt, MachinePointerInfo(DestSV),
12455 MachinePointerInfo(SrcSV));
12456}
12457
12458SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12459 assert(Subtarget->isTargetDarwin() &&
12460 "automatic va_arg instruction only works on Darwin");
12461
12462 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12463 EVT VT = Op.getValueType();
12464 SDLoc DL(Op);
12465 SDValue Chain = Op.getOperand(0);
12466 SDValue Addr = Op.getOperand(1);
12467 MaybeAlign Align(Op.getConstantOperandVal(3));
12468 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12469 auto PtrVT = getPointerTy(DAG.getDataLayout());
12470 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12471 SDValue VAList =
12472 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12473 Chain = VAList.getValue(1);
12474 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12475
12476 if (VT.isScalableVector())
12477 report_fatal_error("Passing SVE types to variadic functions is "
12478 "currently not supported");
12479
12480 if (Align && *Align > MinSlotSize) {
12481 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12482 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12483 VAList =
12484 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12485 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12486 }
12487
12488 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12489 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12490
12491 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12492 // up to 64 bits. At the very least, we have to increase the striding of the
12493 // vaargs list to match this, and for FP values we need to introduce
12494 // FP_ROUND nodes as well.
12495 if (VT.isInteger() && !VT.isVector())
12496 ArgSize = std::max(ArgSize, MinSlotSize);
12497 bool NeedFPTrunc = false;
12498 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12499 ArgSize = 8;
12500 NeedFPTrunc = true;
12501 }
12502
12503 // Increment the pointer, VAList, to the next vaarg
12504 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12505 DAG.getConstant(ArgSize, DL, PtrVT));
12506 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12507
12508 // Store the incremented VAList to the legalized pointer
12509 SDValue APStore =
12510 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12511
12512 // Load the actual argument out of the pointer VAList
12513 if (NeedFPTrunc) {
12514 // Load the value as an f64.
12515 SDValue WideFP =
12516 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12517 // Round the value down to an f32.
12518 SDValue NarrowFP =
12519 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12520 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12521 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12522 // Merge the rounded value with the chain output of the load.
12523 return DAG.getMergeValues(Ops, DL);
12524 }
12525
12526 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12527}
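// Illustrative sketch (not part of the original source): the pointer round-up
// performed above when a va_arg slot is over-aligned, modelled on plain
// integers. Hypothetical helper, exposition only:
static inline uint64_t refAlignUp(uint64_t Ptr, uint64_t Alignment) {
  // Assumes Alignment is a power of two: (Ptr + Alignment - 1) & -Alignment
  // rounds Ptr up to the next multiple, matching the ADD/AND pair emitted.
  return (Ptr + Alignment - 1) & ~(Alignment - 1);
}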
12528
12529SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12530 SelectionDAG &DAG) const {
12531 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12532 MFI.setFrameAddressIsTaken(true);
12533
12534 EVT VT = Op.getValueType();
12535 SDLoc DL(Op);
12536 unsigned Depth = Op.getConstantOperandVal(0);
12537 SDValue FrameAddr =
12538 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12539 while (Depth--)
12540 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12541 MachinePointerInfo());
12542
12543 if (Subtarget->isTargetILP32())
12544 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12545 DAG.getValueType(VT));
12546
12547 return FrameAddr;
12548}
12549
12550SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12551 SelectionDAG &DAG) const {
12552 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12553
12554 EVT VT = getPointerTy(DAG.getDataLayout());
12555 int FI = MFI.CreateFixedObject(4, 0, false);
12556 return DAG.getFrameIndex(FI, VT);
12557}
12558
12559#define GET_REGISTER_MATCHER
12560#include "AArch64GenAsmMatcher.inc"
12561
12562// FIXME? Maybe this could be a TableGen attribute on some registers and
12563// this table could be generated automatically from RegInfo.
12564Register AArch64TargetLowering::
12565getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12566 Register Reg = MatchRegisterName(RegName);
12567 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12568 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12569 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12570 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12571 !MRI->isReservedReg(MF, Reg))
12572 Reg = Register();
12573 }
12574 return Reg;
12575}
12576
12577SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12578 SelectionDAG &DAG) const {
12579 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
12580
12581 EVT VT = Op.getValueType();
12582 SDLoc DL(Op);
12583
12584 SDValue FrameAddr =
12585 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12586 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
12587
12588 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12589}
12590
12591SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12592 SelectionDAG &DAG) const {
12593 MachineFunction &MF = DAG.getMachineFunction();
12594 MachineFrameInfo &MFI = MF.getFrameInfo();
12595 MFI.setReturnAddressIsTaken(true);
12596
12597 EVT VT = Op.getValueType();
12598 SDLoc DL(Op);
12599 unsigned Depth = Op.getConstantOperandVal(0);
12600 SDValue ReturnAddress;
12601 if (Depth) {
12602 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12603 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
12604 ReturnAddress = DAG.getLoad(
12605 VT, DL, DAG.getEntryNode(),
12606 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12607 } else {
12608 // Return LR, which contains the return address. Mark it an implicit
12609 // live-in.
12610 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12611 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12612 }
12613
12614 // The XPACLRI instruction assembles to a hint-space instruction before
12615 // Armv8.3-A; therefore this instruction can be safely used for any
12616 // pre-Armv8.3-A architecture. On Armv8.3-A and onwards XPACI is available, so use
12617 // that instead.
12618 SDNode *St;
12619 if (Subtarget->hasPAuth()) {
12620 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12621 } else {
12622 // XPACLRI operates on LR therefore we must move the operand accordingly.
12623 SDValue Chain =
12624 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12625 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
12626 }
12627 return SDValue(St, 0);
12628}
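// Note that when pointer authentication is enabled the value live in LR may
// carry a PAC in its upper bits; the XPACI/XPACLRI above strips it so that
// __builtin_return_address(0) yields a canonical code address.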
12629
12630/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
12631/// i32 values and take a 2 x i32 value to shift plus a shift amount.
12632SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12633 SelectionDAG &DAG) const {
12634 SDValue Lo, Hi;
12635 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12636 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12637}
12638
12639 bool AArch64TargetLowering::isOffsetFoldingLegal(
12640 const GlobalAddressSDNode *GA) const {
12641 // Offsets are folded in the DAG combine rather than here so that we can
12642 // intelligently choose an offset based on the uses.
12643 return false;
12644}
12645
12646 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12647 bool OptForSize) const {
12648 bool IsLegal = false;
12649 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
12650 // 16-bit case when target has full fp16 support.
12651 // We encode bf16 bit patterns as if they were fp16. This results in very
12652 // strange looking assembly but should populate the register with appropriate
12653 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12654 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12655 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12656 // FIXME: We should be able to handle f128 as well with a clever lowering.
12657 const APInt ImmInt = Imm.bitcastToAPInt();
12658 if (VT == MVT::f64)
12659 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12660 else if (VT == MVT::f32)
12661 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12662 else if (VT == MVT::f16 || VT == MVT::bf16)
12663 IsLegal =
12664 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12665 Imm.isPosZero();
12666
12667 // If we cannot materialize the value in the fmov immediate field, check if
12668 // it can be encoded as the immediate operand of a logical instruction.
12669 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12670 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12671 // generate that fmov.
12672 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12673 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12674 // however the mov+fmov sequence is always better because of the reduced
12675 // cache pressure. The timings are still the same if you consider
12676 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12677 // movw+movk is fused). So we limit to at most 2 instructions.
12678 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12679 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12680 assert(Insn.size() <= 4 &&
12681 "Should be able to build any value with at most 4 moves");
12682 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12683 IsLegal = Insn.size() <= Limit;
12684 }
12685
12686 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12687 << " imm value: "; Imm.dump(););
12688 return IsLegal;
12689}
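// Worked example for f32: 1.0f and -0.5f fit the 8-bit FMOV immediate
// encoding, so they are legal with a single instruction. 0.1f (0x3DCCCCCD)
// does not, but its bit pattern expands to MOVZ+MOVK+FMOV; the two integer
// moves are within the default limit of 2, so it is still reported legal
// unless optimizing for size (where the limit drops to 1).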
12690
12691//===----------------------------------------------------------------------===//
12692// AArch64 Optimization Hooks
12693//===----------------------------------------------------------------------===//
12694
12695static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12696 SDValue Operand, SelectionDAG &DAG,
12697 int &ExtraSteps) {
12698 EVT VT = Operand.getValueType();
12699 if ((ST->hasNEON() &&
12700 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12701 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12702 VT == MVT::v4f32)) ||
12703 (ST->hasSVE() &&
12704 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12705 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12706 // For the reciprocal estimates, convergence is quadratic, so the number
12707 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12708 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12709 // the result for float (23 mantissa bits) is 2 and for double (52
12710 // mantissa bits) is 3.
12711 constexpr unsigned AccurateBits = 8;
12712 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12713 ExtraSteps = DesiredBits <= AccurateBits
12714 ? 0
12715 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
12716 }
12717
12718 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12719 }
12720
12721 return SDValue();
12722}
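// For example, with the 8-bit accurate hardware estimate this yields
// ExtraSteps = ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2 for f32 (24-bit
// significand) and ceil(log2(53)) - ceil(log2(8)) = 6 - 3 = 3 for f64.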
12723
12724SDValue
12725AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12726 const DenormalMode &Mode) const {
12727 SDLoc DL(Op);
12728 EVT VT = Op.getValueType();
12729 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12730 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12731 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12732}
12733
12734SDValue
12735AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12736 SelectionDAG &DAG) const {
12737 return Op;
12738}
12739
12740SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12741 SelectionDAG &DAG, int Enabled,
12742 int &ExtraSteps,
12743 bool &UseOneConst,
12744 bool Reciprocal) const {
12745 if (Enabled == ReciprocalEstimate::Enabled ||
12746 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12747 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12748 DAG, ExtraSteps)) {
12749 SDLoc DL(Operand);
12750 EVT VT = Operand.getValueType();
12751
12752 // Ensure nodes can be recognized by isAssociativeAndCommutative.
12753 SDNodeFlags Flags =
12754 SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
12755
12756 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12757 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
12758 for (int i = ExtraSteps; i > 0; --i) {
12759 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12760 Flags);
12761 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12762 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12763 }
12764 if (!Reciprocal)
12765 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12766
12767 ExtraSteps = 0;
12768 return Estimate;
12769 }
12770
12771 return SDValue();
12772}
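// For an f32 square root with the default 2 refinement steps this emits:
//   e = FRSQRTE(x); e = e * FRSQRTS(x, e*e); e = e * FRSQRTS(x, e*e);
//   result = x * e   (the final multiply is skipped when Reciprocal is set)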
12773
12774SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12775 SelectionDAG &DAG, int Enabled,
12776 int &ExtraSteps) const {
12777 if (Enabled == ReciprocalEstimate::Enabled)
12778 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12779 DAG, ExtraSteps)) {
12780 SDLoc DL(Operand);
12781 EVT VT = Operand.getValueType();
12782
12783 SDNodeFlags Flags = SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
12784
12785 // Newton reciprocal iteration: E * (2 - X * E)
12786 // AArch64 reciprocal iteration instruction: (2 - M * N)
12787 for (int i = ExtraSteps; i > 0; --i) {
12788 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12789 Estimate, Flags);
12790 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12791 }
12792
12793 ExtraSteps = 0;
12794 return Estimate;
12795 }
12796
12797 return SDValue();
12798}
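// Likewise, an f32 reciprocal with the default 2 refinement steps becomes:
//   e = FRECPE(x); e = e * FRECPS(x, e); e = e * FRECPS(x, e)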
12799
12800//===----------------------------------------------------------------------===//
12801// AArch64 Inline Assembly Support
12802//===----------------------------------------------------------------------===//
12803
12804// Table of Constraints
12805 // TODO: This is the current set of constraints supported by ARM for the
12806 // compiler; not all of them may make sense.
12807//
12808// r - A general register
12809// w - An FP/SIMD register of some size in the range v0-v31
12810// x - An FP/SIMD register of some size in the range v0-v15
12811// I - Constant that can be used with an ADD instruction
12812// J - Constant that can be used with a SUB instruction
12813// K - Constant that can be used with a 32-bit logical instruction
12814// L - Constant that can be used with a 64-bit logical instruction
12815// M - Constant that can be used as a 32-bit MOV immediate
12816// N - Constant that can be used as a 64-bit MOV immediate
12817// Q - A memory reference with base register and no offset
12818// S - A symbolic address
12819// Y - Floating point constant zero
12820// Z - Integer constant zero
12821//
12822// Note that general register operands will be output using their 64-bit x
12823// register name, whatever the size of the variable, unless the asm operand
12824// is prefixed by the %w modifier. Floating-point and SIMD register operands
12825// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12826// %q modifier.
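//
// A typical use from C inline assembly (illustrative example, not taken from
// the sources): asm("add %w0, %w1, %2" : "=r"(res) : "r"(a), "I"(4095));
// Here "I" accepts 0..4095 (optionally shifted left by 12) and the %w modifier
// selects the 32-bit register name for the output.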
12827const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12828 // At this point, we have to lower this constraint to something else, so we
12829 // lower it to an "r" or "w". However, by doing this we will force the result
12830 // to be in register, while the X constraint is much more permissive.
12831 //
12832 // Although we are correct (we are free to emit anything, without
12833 // constraints), we might break use cases that would expect us to be more
12834 // efficient and emit something else.
12835 if (!Subtarget->hasFPARMv8())
12836 return "r";
12837
12838 if (ConstraintVT.isFloatingPoint())
12839 return "w";
12840
12841 if (ConstraintVT.isVector() &&
12842 (ConstraintVT.getSizeInBits() == 64 ||
12843 ConstraintVT.getSizeInBits() == 128))
12844 return "w";
12845
12846 return "r";
12847}
12848
12849 enum class PredicateConstraint { Uph, Upl, Upa };
12850
12851// Returns a {Reg, RegisterClass} tuple if the constraint is
12852// a specific predicate register.
12853//
12854// For some constraint like "{pn3}" the default path in
12855// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12856// suitable register class for this register is "PPRorPNR", after which it
12857// determines that nxv16i1 is an appropriate type for the constraint, which is
12858// not what we want. The code here pre-empts this by matching the register
12859// explicitly.
12860static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12861 parseSVERegAsConstraint(StringRef Constraint) {
12862 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12863 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12864 return std::nullopt;
12865
12866 bool IsPredicate = Constraint[1] == 'p';
12867 Constraint = Constraint.substr(2, Constraint.size() - 3);
12868 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
12869 if (IsPredicateAsCount)
12870 Constraint = Constraint.drop_front(1);
12871
12872 unsigned V;
12873 if (Constraint.getAsInteger(10, V) || V > 31)
12874 return std::nullopt;
12875
12876 if (IsPredicateAsCount)
12877 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12878 if (IsPredicate)
12879 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12880 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
12881}
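// For example, "{pn8}" maps to {AArch64::PN8, &AArch64::PNRRegClass}, "{p0}"
// maps to {AArch64::P0, &AArch64::PPRRegClass} and "{z31}" maps to
// {AArch64::Z31, &AArch64::ZPRRegClass}; anything else returns std::nullopt
// and is left to the generic constraint handling.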
12882
12883 static std::optional<PredicateConstraint>
12884 parsePredicateConstraint(StringRef Constraint) {
12885 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
12886 .Case("Uph", PredicateConstraint::Uph)
12887 .Case("Upl", PredicateConstraint::Upl)
12888 .Case("Upa", PredicateConstraint::Upa)
12889 .Default(std::nullopt);
12890}
12891
12892 static const TargetRegisterClass *
12893 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
12894 if (VT != MVT::aarch64svcount &&
12895 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12896 return nullptr;
12897
12898 switch (Constraint) {
12899 case PredicateConstraint::Uph:
12900 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12901 : &AArch64::PPR_p8to15RegClass;
12902 case PredicateConstraint::Upl:
12903 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12904 : &AArch64::PPR_3bRegClass;
12905 case PredicateConstraint::Upa:
12906 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12907 : &AArch64::PPRRegClass;
12908 }
12909
12910 llvm_unreachable("Missing PredicateConstraint!");
12911}
12912
12913 enum class ReducedGprConstraint { Uci, Ucj };
12914
12915 static std::optional<ReducedGprConstraint>
12916 parseReducedGprConstraint(StringRef Constraint) {
12917 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
12918 .Case("Uci", ReducedGprConstraint::Uci)
12919 .Case("Ucj", ReducedGprConstraint::Ucj)
12920 .Default(std::nullopt);
12921}
12922
12923 static const TargetRegisterClass *
12924 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
12925 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12926 return nullptr;
12927
12928 switch (Constraint) {
12929 case ReducedGprConstraint::Uci:
12930 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12931 case ReducedGprConstraint::Ucj:
12932 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12933 }
12934
12935 llvm_unreachable("Missing ReducedGprConstraint!");
12936}
12937
12938// The set of cc code supported is from
12939// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12940 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
12941 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
12942 .Case("{@cchi}", AArch64CC::HI)
12943 .Case("{@cccs}", AArch64CC::HS)
12944 .Case("{@cclo}", AArch64CC::LO)
12945 .Case("{@ccls}", AArch64CC::LS)
12946 .Case("{@cccc}", AArch64CC::LO)
12947 .Case("{@cceq}", AArch64CC::EQ)
12948 .Case("{@ccgt}", AArch64CC::GT)
12949 .Case("{@ccge}", AArch64CC::GE)
12950 .Case("{@cclt}", AArch64CC::LT)
12951 .Case("{@ccle}", AArch64CC::LE)
12952 .Case("{@cchs}", AArch64CC::HS)
12953 .Case("{@ccne}", AArch64CC::NE)
12954 .Case("{@ccvc}", AArch64CC::VC)
12955 .Case("{@ccpl}", AArch64CC::PL)
12956 .Case("{@ccvs}", AArch64CC::VS)
12957 .Case("{@ccmi}", AArch64CC::MI)
12958 .Default(AArch64CC::Invalid);
12959 return Cond;
12960}
12961
12962/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12963/// WZR, invert(<cond>)'.
12964 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
12965 SelectionDAG &DAG) {
12966 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
12967 DAG.getConstant(0, DL, MVT::i32),
12968 DAG.getConstant(0, DL, MVT::i32),
12969 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
12970}
12971
12972// Lower @cc flag output via getSETCC.
12973SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12974 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12975 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12976 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12977 if (Cond == AArch64CC::Invalid)
12978 return SDValue();
12979 // The output variable should be a scalar integer.
12980 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12981 OpInfo.ConstraintVT.getSizeInBits() < 8)
12982 report_fatal_error("Flag output operand is of invalid type");
12983
12984 // Get NZCV register. Only update chain when copyfrom is glued.
12985 if (Glue.getNode()) {
12986 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
12987 Chain = Glue.getValue(1);
12988 } else
12989 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
12990 // Extract CC code.
12991 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12992
12993 SDValue Result;
12994
12995 // Truncate or ZERO_EXTEND based on value types.
12996 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12997 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12998 else
12999 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
13000
13001 return Result;
13002}
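// This implements GCC-style flag output operands, e.g. (illustrative only):
//   asm("cmp %w1, %w2" : "=@cceq"(is_eq) : "r"(a), "r"(b));
// The "=@cceq" output is materialized as a CSINC on NZCV (see getSETCC above)
// and then truncated or zero-extended to the type of the C variable.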
13003
13004/// getConstraintType - Given a constraint letter, return the type of
13005/// constraint it is for this target.
13006 AArch64TargetLowering::ConstraintType
13007 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
13008 if (Constraint.size() == 1) {
13009 switch (Constraint[0]) {
13010 default:
13011 break;
13012 case 'x':
13013 case 'w':
13014 case 'y':
13015 return C_RegisterClass;
13016 // An address with a single base register. Due to the way we
13017 // currently handle addresses it is the same as 'r'.
13018 case 'Q':
13019 return C_Memory;
13020 case 'I':
13021 case 'J':
13022 case 'K':
13023 case 'L':
13024 case 'M':
13025 case 'N':
13026 case 'Y':
13027 case 'Z':
13028 return C_Immediate;
13029 case 'z':
13030 case 'S': // A symbol or label reference with a constant offset
13031 return C_Other;
13032 }
13033 } else if (parsePredicateConstraint(Constraint))
13034 return C_RegisterClass;
13035 else if (parseReducedGprConstraint(Constraint))
13036 return C_RegisterClass;
13037 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
13038 return C_Other;
13039 return TargetLowering::getConstraintType(Constraint);
13040}
13041
13042/// Examine constraint type and operand type and determine a weight value.
13043/// This object must already have been set up with the operand type
13044/// and the current alternative constraint selected.
13045 TargetLowering::ConstraintWeight
13046 AArch64TargetLowering::getSingleConstraintMatchWeight(
13047 AsmOperandInfo &info, const char *constraint) const {
13048 ConstraintWeight weight = CW_Invalid;
13049 Value *CallOperandVal = info.CallOperandVal;
13050 // If we don't have a value, we can't do a match,
13051 // but allow it at the lowest weight.
13052 if (!CallOperandVal)
13053 return CW_Default;
13054 Type *type = CallOperandVal->getType();
13055 // Look at the constraint type.
13056 switch (*constraint) {
13057 default:
13058 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
13059 break;
13060 case 'x':
13061 case 'w':
13062 case 'y':
13063 if (type->isFloatingPointTy() || type->isVectorTy())
13064 weight = CW_Register;
13065 break;
13066 case 'z':
13067 weight = CW_Constant;
13068 break;
13069 case 'U':
13070 if (parsePredicateConstraint(constraint) ||
13071 parseReducedGprConstraint(constraint))
13072 weight = CW_Register;
13073 break;
13074 }
13075 return weight;
13076}
13077
13078std::pair<unsigned, const TargetRegisterClass *>
13079AArch64TargetLowering::getRegForInlineAsmConstraint(
13080 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13081 if (Constraint.size() == 1) {
13082 switch (Constraint[0]) {
13083 case 'r':
13084 if (VT.isScalableVector())
13085 return std::make_pair(0U, nullptr);
13086 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
13087 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13088 if (VT.getFixedSizeInBits() == 64)
13089 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13090 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13091 case 'w': {
13092 if (!Subtarget->hasFPARMv8())
13093 break;
13094 if (VT.isScalableVector()) {
13095 if (VT.getVectorElementType() != MVT::i1)
13096 return std::make_pair(0U, &AArch64::ZPRRegClass);
13097 return std::make_pair(0U, nullptr);
13098 }
13099 if (VT == MVT::Other)
13100 break;
13101 uint64_t VTSize = VT.getFixedSizeInBits();
13102 if (VTSize == 16)
13103 return std::make_pair(0U, &AArch64::FPR16RegClass);
13104 if (VTSize == 32)
13105 return std::make_pair(0U, &AArch64::FPR32RegClass);
13106 if (VTSize == 64)
13107 return std::make_pair(0U, &AArch64::FPR64RegClass);
13108 if (VTSize == 128)
13109 return std::make_pair(0U, &AArch64::FPR128RegClass);
13110 break;
13111 }
13112 // The instructions that this constraint is designed for can
13113 // only take 128-bit registers so just use that regclass.
13114 case 'x':
13115 if (!Subtarget->hasFPARMv8())
13116 break;
13117 if (VT.isScalableVector())
13118 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13119 if (VT.getSizeInBits() == 128)
13120 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13121 break;
13122 case 'y':
13123 if (!Subtarget->hasFPARMv8())
13124 break;
13125 if (VT.isScalableVector())
13126 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13127 break;
13128 }
13129 } else {
13130 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13131 // SME functions that are not in streaming mode, should
13132 // still observe clobbers of Z-registers by clobbering
13133 // the lower 128bits of those registers.
13134 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13135 !Subtarget->isSVEorStreamingSVEAvailable())
13136 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13137 &AArch64::FPR128RegClass);
13138 return *P;
13139 }
13140 if (const auto PC = parsePredicateConstraint(Constraint))
13141 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13142 return std::make_pair(0U, RegClass);
13143
13144 if (const auto RGC = parseReducedGprConstraint(Constraint))
13145 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13146 return std::make_pair(0U, RegClass);
13147 }
13148 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13149 parseConstraintCode(Constraint) != AArch64CC::Invalid)
13150 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13151
13152 if (Constraint == "{za}") {
13153 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13154 }
13155
13156 if (Constraint == "{zt0}") {
13157 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13158 }
13159
13160 // Use the default implementation in TargetLowering to convert the register
13161 // constraint into a member of a register class.
13162 std::pair<unsigned, const TargetRegisterClass *> Res;
13163 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13164
13165 // Not found as a standard register?
13166 if (!Res.second) {
13167 unsigned Size = Constraint.size();
13168 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13169 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13170 int RegNo;
13171 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13172 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13173 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
13174 // By default we'll emit v0-v31 for this unless there's a modifier where
13175 // we'll emit the correct register as well.
13176 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
13177 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13178 Res.second = &AArch64::FPR64RegClass;
13179 } else {
13180 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13181 Res.second = &AArch64::FPR128RegClass;
13182 }
13183 }
13184 }
13185 }
13186
13187 if (Res.second && !Subtarget->hasFPARMv8() &&
13188 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13189 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13190 return std::make_pair(0U, nullptr);
13191
13192 return Res;
13193}
13194
13195 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
13196 llvm::Type *Ty,
13197 bool AllowUnknown) const {
13198 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13199 return EVT(MVT::i64x8);
13200
13201 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13202}
13203
13204/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13205/// vector. If it is invalid, don't add anything to Ops.
13206void AArch64TargetLowering::LowerAsmOperandForConstraint(
13207 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13208 SelectionDAG &DAG) const {
13209 SDValue Result;
13210
13211 // Currently only support length 1 constraints.
13212 if (Constraint.size() != 1)
13213 return;
13214
13215 char ConstraintLetter = Constraint[0];
13216 switch (ConstraintLetter) {
13217 default:
13218 break;
13219
13220 // This set of constraints deal with valid constants for various instructions.
13221 // Validate and return a target constant for them if we can.
13222 case 'z': {
13223 // 'z' maps to xzr or wzr so it needs an input of 0.
13224 if (!isNullConstant(Op))
13225 return;
13226
13227 if (Op.getValueType() == MVT::i64)
13228 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13229 else
13230 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13231 break;
13232 }
13233 case 'S':
13234 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13235 // supported for PIC while "s" isn't, making "s" less useful. We implement
13236 // "S" but not "s".
13237 TargetLowering::LowerAsmOperandForConstraint(Op, "s", Ops, DAG);
13238 break;
13239
13240 case 'I':
13241 case 'J':
13242 case 'K':
13243 case 'L':
13244 case 'M':
13245 case 'N':
13246 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
13247 if (!C)
13248 return;
13249
13250 // Grab the value and do some validation.
13251 uint64_t CVal = C->getZExtValue();
13252 switch (ConstraintLetter) {
13253 // The I constraint applies only to simple ADD or SUB immediate operands:
13254 // i.e. 0 to 4095 with optional shift by 12
13255 // The J constraint applies only to ADD or SUB immediates that would be
13256 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13257 // instruction [or vice versa], in other words -1 to -4095 with optional
13258 // left shift by 12.
13259 case 'I':
13260 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13261 break;
13262 return;
13263 case 'J': {
13264 uint64_t NVal = -C->getSExtValue();
13265 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13266 CVal = C->getSExtValue();
13267 break;
13268 }
13269 return;
13270 }
13271 // The K and L constraints apply *only* to logical immediates, including
13272 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13273 // been removed and MOV should be used). So these constraints have to
13274 // distinguish between bit patterns that are valid 32-bit or 64-bit
13275 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13276 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13277 // versa.
13278 case 'K':
13279 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13280 break;
13281 return;
13282 case 'L':
13283 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13284 break;
13285 return;
13286 // The M and N constraints are a superset of K and L respectively, for use
13287 // with the MOV (immediate) alias. As well as the logical immediates they
13288 // also match 32 or 64-bit immediates that can be loaded either using a
13289 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13290 // (M) or 64-bit 0x1234000000000000 (N) etc.
13291 // As a note some of this code is liberally stolen from the asm parser.
13292 case 'M': {
13293 if (!isUInt<32>(CVal))
13294 return;
13295 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13296 break;
13297 if ((CVal & 0xFFFF) == CVal)
13298 break;
13299 if ((CVal & 0xFFFF0000ULL) == CVal)
13300 break;
13301 uint64_t NCVal = ~(uint32_t)CVal;
13302 if ((NCVal & 0xFFFFULL) == NCVal)
13303 break;
13304 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13305 break;
13306 return;
13307 }
13308 case 'N': {
13309 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13310 break;
13311 if ((CVal & 0xFFFFULL) == CVal)
13312 break;
13313 if ((CVal & 0xFFFF0000ULL) == CVal)
13314 break;
13315 if ((CVal & 0xFFFF00000000ULL) == CVal)
13316 break;
13317 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13318 break;
13319 uint64_t NCVal = ~CVal;
13320 if ((NCVal & 0xFFFFULL) == NCVal)
13321 break;
13322 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13323 break;
13324 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13325 break;
13326 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13327 break;
13328 return;
13329 }
13330 default:
13331 return;
13332 }
13333
13334 // All assembler immediates are 64-bit integers.
13335 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13336 break;
13337 }
13338
13339 if (Result.getNode()) {
13340 Ops.push_back(Result);
13341 return;
13342 }
13343
13344 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13345}
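// For instance (illustrative values), "K"(0xff00ff00) is accepted because
// 0xff00ff00 is a valid 32-bit bitmask immediate, and "N"(0x1234000000000000)
// is accepted because it can be materialized with a single MOVZ; constants
// matching neither rule are not added to Ops, so the constraint is rejected.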
13346
13347//===----------------------------------------------------------------------===//
13348// AArch64 Advanced SIMD Support
13349//===----------------------------------------------------------------------===//
13350
13351/// WidenVector - Given a value in the V64 register class, produce the
13352/// equivalent value in the V128 register class.
13353 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
13354 EVT VT = V64Reg.getValueType();
13355 unsigned NarrowSize = VT.getVectorNumElements();
13356 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13357 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13358 SDLoc DL(V64Reg);
13359
13360 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13361 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13362}
13363
13364/// getExtFactor - Determine the adjustment factor for the position when
13365/// generating an "extract from vector registers" instruction.
13366static unsigned getExtFactor(SDValue &V) {
13367 EVT EltType = V.getValueType().getVectorElementType();
13368 return EltType.getSizeInBits() / 8;
13369}
13370
13371// Check if a vector is built from one vector via extracted elements of
13372// another together with an AND mask, ensuring that all elements fit
13373// within range. This can be reconstructed using AND and NEON's TBL1.
13375 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13376 SDLoc DL(Op);
13377 EVT VT = Op.getValueType();
13378 assert(!VT.isScalableVector() &&
13379 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13380
13381 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13382 // directly to TBL1.
13383 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13384 return SDValue();
13385
13386 unsigned NumElts = VT.getVectorNumElements();
13387 assert((NumElts == 8 || NumElts == 16) &&
13388 "Need to have exactly 8 or 16 elements in vector.");
13389
13390 SDValue SourceVec;
13391 SDValue MaskSourceVec;
13392 SmallVector<SDValue, 16> AndMaskConstants;
13393
13394 for (unsigned i = 0; i < NumElts; ++i) {
13395 SDValue V = Op.getOperand(i);
13396 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13397 return SDValue();
13398
13399 SDValue OperandSourceVec = V.getOperand(0);
13400 if (!SourceVec)
13401 SourceVec = OperandSourceVec;
13402 else if (SourceVec != OperandSourceVec)
13403 return SDValue();
13404
13405 // This only looks at shuffles with elements that are
13406 // a) truncated by a constant AND mask extracted from a mask vector, or
13407 // b) extracted directly from a mask vector.
13408 SDValue MaskSource = V.getOperand(1);
13409 if (MaskSource.getOpcode() == ISD::AND) {
13410 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13411 return SDValue();
13412
13413 AndMaskConstants.push_back(MaskSource.getOperand(1));
13414 MaskSource = MaskSource->getOperand(0);
13415 } else if (!AndMaskConstants.empty()) {
13416 // Either all or no operands should have an AND mask.
13417 return SDValue();
13418 }
13419
13420 // An ANY_EXTEND may be inserted between the AND and the source vector
13421 // extraction. We don't care about that, so we can just skip it.
13422 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13423 MaskSource = MaskSource.getOperand(0);
13424
13425 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13426 return SDValue();
13427
13428 SDValue MaskIdx = MaskSource.getOperand(1);
13429 if (!isa<ConstantSDNode>(MaskIdx) ||
13430 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13431 return SDValue();
13432
13433 // We only apply this if all elements come from the same vector with the
13434 // same vector type.
13435 if (!MaskSourceVec) {
13436 MaskSourceVec = MaskSource->getOperand(0);
13437 if (MaskSourceVec.getValueType() != VT)
13438 return SDValue();
13439 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13440 return SDValue();
13441 }
13442 }
13443
13444 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13445 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13446 // insert, we know that the index in the mask must be smaller than the number
13447 // of elements in the source, or we would have an out-of-bounds access.
13448 if (NumElts == 8)
13449 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13450 DAG.getUNDEF(VT));
13451
13452 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13453 if (!AndMaskConstants.empty())
13454 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13455 DAG.getBuildVector(VT, DL, AndMaskConstants));
13456
13457 return DAG.getNode(
13458 ISD::INTRINSIC_WO_CHAIN, DL, VT,
13459 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
13460 SourceVec, MaskSourceVec);
13461}
13462
13463// Gather data to see if the operation can be modelled as a
13464// shuffle in combination with VEXTs.
13465 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
13466 SelectionDAG &DAG) const {
13467 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13468 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13469 SDLoc DL(Op);
13470 EVT VT = Op.getValueType();
13471 assert(!VT.isScalableVector() &&
13472 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13473 unsigned NumElts = VT.getVectorNumElements();
13474
13475 struct ShuffleSourceInfo {
13476 SDValue Vec;
13477 unsigned MinElt;
13478 unsigned MaxElt;
13479
13480 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13481 // be compatible with the shuffle we intend to construct. As a result
13482 // ShuffleVec will be some sliding window into the original Vec.
13483 SDValue ShuffleVec;
13484
13485 // Code should guarantee that element i in Vec starts at element "WindowBase
13486 // + i * WindowScale in ShuffleVec".
13487 int WindowBase;
13488 int WindowScale;
13489
13490 ShuffleSourceInfo(SDValue Vec)
13491 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13492 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13493
13494 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13495 };
13496
13497 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13498 // node.
13499 SmallVector<ShuffleSourceInfo, 2> Sources;
13500 for (unsigned i = 0; i < NumElts; ++i) {
13501 SDValue V = Op.getOperand(i);
13502 if (V.isUndef())
13503 continue;
13504 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13505 !isa<ConstantSDNode>(V.getOperand(1)) ||
13506 V.getOperand(0).getValueType().isScalableVector()) {
13507 LLVM_DEBUG(
13508 dbgs() << "Reshuffle failed: "
13509 "a shuffle can only come from building a vector from "
13510 "various elements of other fixed-width vectors, provided "
13511 "their indices are constant\n");
13512 return SDValue();
13513 }
13514
13515 // Add this element source to the list if it's not already there.
13516 SDValue SourceVec = V.getOperand(0);
13517 auto Source = find(Sources, SourceVec);
13518 if (Source == Sources.end())
13519 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13520
13521 // Update the minimum and maximum lane number seen.
13522 unsigned EltNo = V.getConstantOperandVal(1);
13523 Source->MinElt = std::min(Source->MinElt, EltNo);
13524 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13525 }
13526
13527 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13528 // better than moving to/from gpr registers for larger vectors.
13529 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13530 // Construct a mask for the tbl. We may need to adjust the index for types
13531 // larger than i8.
13532 SmallVector<unsigned, 16> Mask;
13533 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13534 for (unsigned I = 0; I < NumElts; ++I) {
13535 SDValue V = Op.getOperand(I);
13536 if (V.isUndef()) {
13537 for (unsigned OF = 0; OF < OutputFactor; OF++)
13538 Mask.push_back(-1);
13539 continue;
13540 }
13541 // Set the Mask lanes adjusted for the size of the input and output
13542 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13543 // output element, adjusted in their positions per input and output types.
13544 unsigned Lane = V.getConstantOperandVal(1);
13545 for (unsigned S = 0; S < Sources.size(); S++) {
13546 if (V.getOperand(0) == Sources[S].Vec) {
13547 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13548 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13549 for (unsigned OF = 0; OF < OutputFactor; OF++)
13550 Mask.push_back(InputBase + OF);
13551 break;
13552 }
13553 }
13554 }
13555
13556 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13557 // v16i8, and the TBLMask
13558 SmallVector<SDValue, 16> TBLOperands;
13559 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13560 ? Intrinsic::aarch64_neon_tbl3
13561 : Intrinsic::aarch64_neon_tbl4,
13562 DL, MVT::i32));
13563 for (unsigned i = 0; i < Sources.size(); i++) {
13564 SDValue Src = Sources[i].Vec;
13565 EVT SrcVT = Src.getValueType();
13566 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13567 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13568 "Expected a legally typed vector");
13569 if (SrcVT.is64BitVector())
13570 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13571 DAG.getUNDEF(MVT::v8i8));
13572 TBLOperands.push_back(Src);
13573 }
13574
13575 SmallVector<SDValue, 16> TBLMask;
13576 for (unsigned i = 0; i < Mask.size(); i++)
13577 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13578 assert((Mask.size() == 8 || Mask.size() == 16) &&
13579 "Expected a v8i8 or v16i8 Mask");
13580 TBLOperands.push_back(DAG.getBuildVector(
13581 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13582
13583 SDValue Shuffle =
13584 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
13585 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13586 return DAG.getBitcast(VT, Shuffle);
13587 }
13588
13589 if (Sources.size() > 2) {
13590 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13591 << "sensible when at most two source vectors are "
13592 << "involved\n");
13593 return SDValue();
13594 }
13595
13596 // Find out the smallest element size among result and two sources, and use
13597 // it as element size to build the shuffle_vector.
13598 EVT SmallestEltTy = VT.getVectorElementType();
13599 for (auto &Source : Sources) {
13600 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13601 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13602 SmallestEltTy = SrcEltTy;
13603 }
13604 }
13605 unsigned ResMultiplier =
13606 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13607 uint64_t VTSize = VT.getFixedSizeInBits();
13608 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13609 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13610
13611 // If the source vector is too wide or too narrow, we may nevertheless be able
13612 // to construct a compatible shuffle either by concatenating it with UNDEF or
13613 // extracting a suitable range of elements.
13614 for (auto &Src : Sources) {
13615 EVT SrcVT = Src.ShuffleVec.getValueType();
13616
13617 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13618 if (SrcVTSize == TypeSize::getFixed(VTSize))
13619 continue;
13620
13621 // This stage of the search produces a source with the same element type as
13622 // the original, but with a total width matching the BUILD_VECTOR output.
13623 EVT EltVT = SrcVT.getVectorElementType();
13624 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13625 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
13626
13627 if (SrcVTSize.getFixedValue() < VTSize) {
13628 assert(2 * SrcVTSize == VTSize);
13629 // We can pad out the smaller vector for free, so if it's part of a
13630 // shuffle...
13631 Src.ShuffleVec =
13632 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
13633 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
13634 continue;
13635 }
13636
13637 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13638 LLVM_DEBUG(
13639 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13640 return SDValue();
13641 }
13642
13643 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13644 LLVM_DEBUG(
13645 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13646 return SDValue();
13647 }
13648
13649 if (Src.MinElt >= NumSrcElts) {
13650 // The extraction can just take the second half
13651 Src.ShuffleVec =
13652 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13653 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13654 Src.WindowBase = -NumSrcElts;
13655 } else if (Src.MaxElt < NumSrcElts) {
13656 // The extraction can just take the first half
13657 Src.ShuffleVec =
13658 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13659 DAG.getConstant(0, DL, MVT::i64));
13660 } else {
13661 // An actual VEXT is needed
13662 SDValue VEXTSrc1 =
13663 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13664 DAG.getConstant(0, DL, MVT::i64));
13665 SDValue VEXTSrc2 =
13666 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13667 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13668 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13669
13670 if (!SrcVT.is64BitVector()) {
13671 LLVM_DEBUG(
13672 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13673 "for SVE vectors.");
13674 return SDValue();
13675 }
13676
13677 Src.ShuffleVec =
13678 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
13679 DAG.getConstant(Imm, DL, MVT::i32));
13680 Src.WindowBase = -Src.MinElt;
13681 }
13682 }
13683
13684 // Another possible incompatibility occurs from the vector element types. We
13685 // can fix this by bitcasting the source vectors to the same type we intend
13686 // for the shuffle.
13687 for (auto &Src : Sources) {
13688 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13689 if (SrcEltTy == SmallestEltTy)
13690 continue;
13691 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13692 if (DAG.getDataLayout().isBigEndian()) {
13693 Src.ShuffleVec =
13694 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
13695 } else {
13696 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
13697 }
13698 Src.WindowScale =
13699 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13700 Src.WindowBase *= Src.WindowScale;
13701 }
13702
13703 // Final check before we try to actually produce a shuffle.
13704 LLVM_DEBUG({
13705 for (auto Src : Sources)
13706 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13707 });
13708
13709 // The stars all align, our next step is to produce the mask for the shuffle.
13710 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13711 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13712 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13713 SDValue Entry = Op.getOperand(i);
13714 if (Entry.isUndef())
13715 continue;
13716
13717 auto Src = find(Sources, Entry.getOperand(0));
13718 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13719
13720 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13721 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13722 // segment.
13723 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13724 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13725 VT.getScalarSizeInBits());
13726 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13727
13728 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13729 // starting at the appropriate offset.
13730 int *LaneMask = &Mask[i * ResMultiplier];
13731
13732 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13733 ExtractBase += NumElts * (Src - Sources.begin());
13734 for (int j = 0; j < LanesDefined; ++j)
13735 LaneMask[j] = ExtractBase + j;
13736 }
13737
13738 // Final check before we try to produce nonsense...
13739 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13740 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13741 return SDValue();
13742 }
13743
13744 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13745 for (unsigned i = 0; i < Sources.size(); ++i)
13746 ShuffleOps[i] = Sources[i].ShuffleVec;
13747
13748 SDValue Shuffle =
13749 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
13750 SDValue V;
13751 if (DAG.getDataLayout().isBigEndian()) {
13752 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
13753 } else {
13754 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
13755 }
13756
13757 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13758 dbgs() << "Reshuffle, creating node: "; V.dump(););
13759
13760 return V;
13761}
13762
13763// check if an EXT instruction can handle the shuffle mask when the
13764// vector sources of the shuffle are the same.
13765static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13766 unsigned NumElts = VT.getVectorNumElements();
13767
13768 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13769 if (M[0] < 0)
13770 return false;
13771
13772 Imm = M[0];
13773
13774 // If this is a VEXT shuffle, the immediate value is the index of the first
13775 // element. The other shuffle indices must be the successive elements after
13776 // the first one.
13777 unsigned ExpectedElt = Imm;
13778 for (unsigned i = 1; i < NumElts; ++i) {
13779 // Increment the expected index. If it wraps around, just follow it
13780 // back to index zero and keep going.
13781 ++ExpectedElt;
13782 if (ExpectedElt == NumElts)
13783 ExpectedElt = 0;
13784
13785 if (M[i] < 0)
13786 continue; // ignore UNDEF indices
13787 if (ExpectedElt != static_cast<unsigned>(M[i]))
13788 return false;
13789 }
13790
13791 return true;
13792}
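// For example, for a single-source v4i16 shuffle the mask <2, 3, 0, 1>
// (wrapping around the end of the vector) is accepted with Imm = 2; the
// caller scales this by the element size (getExtFactor) to form the byte
// immediate of the EXT instruction.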
13793
13794// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13795// v4i32s. This is really a truncate, which we can construct out of (legal)
13796// concats and truncate nodes.
13798 if (V.getValueType() != MVT::v16i8)
13799 return SDValue();
13800 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13801
13802 for (unsigned X = 0; X < 4; X++) {
13803 // Check the first item in each group is an extract from lane 0 of a v4i32
13804 // or v4i16.
13805 SDValue BaseExt = V.getOperand(X * 4);
13806 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13807 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13808 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13809 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13810 BaseExt.getConstantOperandVal(1) != 0)
13811 return SDValue();
13812 SDValue Base = BaseExt.getOperand(0);
13813 // And check the other items are extracts from the same vector.
13814 for (unsigned Y = 1; Y < 4; Y++) {
13815 SDValue Ext = V.getOperand(X * 4 + Y);
13816 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13817 Ext.getOperand(0) != Base ||
13818 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13819 Ext.getConstantOperandVal(1) != Y)
13820 return SDValue();
13821 }
13822 }
13823
13824 // Turn the buildvector into a series of truncates and concats, which will
13825 // become uzip1s. Any v4i32s we found get truncated to v4i16, which are
13826 // concatenated together to produce 2 v8i16s. These are both truncated and
13827 // concatenated together.
13828 SDLoc DL(V);
13829 SDValue Trunc[4] = {
13830 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13831 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13832 for (SDValue &V : Trunc)
13833 if (V.getValueType() == MVT::v4i32)
13834 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13835 SDValue Concat0 =
13836 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13837 SDValue Concat1 =
13838 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13839 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13840 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13841 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13842}
13843
13844/// Check if a vector shuffle corresponds to a DUP instructions with a larger
13845/// element width than the vector lane type. If that is the case the function
13846/// returns true and writes the value of the DUP instruction lane operand into
13847/// DupLaneOp
13848static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13849 unsigned &DupLaneOp) {
13850 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13851 "Only possible block sizes for wide DUP are: 16, 32, 64");
13852
13853 if (BlockSize <= VT.getScalarSizeInBits())
13854 return false;
13855 if (BlockSize % VT.getScalarSizeInBits() != 0)
13856 return false;
13857 if (VT.getSizeInBits() % BlockSize != 0)
13858 return false;
13859
13860 size_t SingleVecNumElements = VT.getVectorNumElements();
13861 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13862 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13863
13864 // We are looking for masks like
13865 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13866 // might be replaced by 'undefined'. BlockIndices will eventually contain
13867 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13868 // for the above examples)
13869 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13870 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13871 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13872 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13873 if (Elt < 0)
13874 continue;
13875 // For now we don't support shuffles that use the second operand
13876 if ((unsigned)Elt >= SingleVecNumElements)
13877 return false;
13878 if (BlockElts[I] < 0)
13879 BlockElts[I] = Elt;
13880 else if (BlockElts[I] != Elt)
13881 return false;
13882 }
13883
13884 // We found a candidate block (possibly with some undefs). It must be a
13885 // sequence of consecutive integers starting with a value divisible by
13886 // NumEltsPerBlock with some values possibly replaced by undef-s.
13887
13888 // Find first non-undef element
13889 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13890 assert(FirstRealEltIter != BlockElts.end() &&
13891 "Shuffle with all-undefs must have been caught by previous cases, "
13892 "e.g. isSplat()");
13893 if (FirstRealEltIter == BlockElts.end()) {
13894 DupLaneOp = 0;
13895 return true;
13896 }
13897
13898 // Index of FirstRealElt in BlockElts
13899 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13900
13901 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13902 return false;
13903 // BlockElts[0] must have the following value if it isn't undef:
13904 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13905
13906 // Check the first element
13907 if (Elt0 % NumEltsPerBlock != 0)
13908 return false;
13909 // Check that the sequence indeed consists of consecutive integers (modulo
13910 // undefs)
13911 for (size_t I = 0; I < NumEltsPerBlock; I++)
13912 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13913 return false;
13914
13915 DupLaneOp = Elt0 / NumEltsPerBlock;
13916 return true;
13917}
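// For example, the v4i16 mask <2, 3, 2, 3> with BlockSize == 32 is recognized
// with DupLaneOp = 1: the shuffle duplicates 32-bit block 1 of the source and
// can therefore be emitted as a single DUP of the wider element type.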
13918
13919// check if an EXT instruction can handle the shuffle mask when the
13920// vector sources of the shuffle are different.
13921static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13922 unsigned &Imm) {
13923 // Look for the first non-undef element.
13924 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13925
13926 // Benefit from APInt to handle overflow when calculating expected element.
13927 unsigned NumElts = VT.getVectorNumElements();
13928 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13929 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13930 /*implicitTrunc=*/true);
13931 // The following shuffle indices must be the successive elements after the
13932 // first real element.
13933 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13934 return Elt != ExpectedElt++ && Elt >= 0;
13935 });
13936 if (FoundWrongElt)
13937 return false;
13938
13939 // The index of an EXT is the first element if it is not UNDEF.
13940 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13941 // value of the first element. E.g.
13942 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13943 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13944 // ExpectedElt is the last mask index plus 1.
13945 Imm = ExpectedElt.getZExtValue();
13946
13947 // There are two different cases that require reversing the input vectors.
13948 // For example, for vector <4 x i32> we have the following cases:
13949 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13950 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13951 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13952 // reversing the two input vectors.
13953 if (Imm < NumElts)
13954 ReverseEXT = true;
13955 else
13956 Imm -= NumElts;
13957
13958 return true;
13959}
13960
13961/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13962/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13963/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13964static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13965 unsigned NumElts = VT.getVectorNumElements();
13966 if (NumElts % 2 != 0)
13967 return false;
13968 WhichResult = (M[0] == 0 ? 0 : 1);
13969 unsigned Idx = WhichResult * NumElts / 2;
13970 for (unsigned i = 0; i != NumElts; i += 2) {
13971 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13972 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13973 return false;
13974 Idx += 1;
13975 }
13976
13977 return true;
13978}
13979
13980/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13981/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13982/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
13983static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13984 unsigned Half = VT.getVectorNumElements() / 2;
13985 WhichResult = (M[0] == 0 ? 0 : 1);
13986 for (unsigned j = 0; j != 2; ++j) {
13987 unsigned Idx = WhichResult;
13988 for (unsigned i = 0; i != Half; ++i) {
13989 int MIdx = M[i + j * Half];
13990 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13991 return false;
13992 Idx += 2;
13993 }
13994 }
13995
13996 return true;
13997}
13998
13999/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
14000/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14001/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
14002static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14003 unsigned NumElts = VT.getVectorNumElements();
14004 if (NumElts % 2 != 0)
14005 return false;
14006 WhichResult = (M[0] == 0 ? 0 : 1);
14007 for (unsigned i = 0; i < NumElts; i += 2) {
14008 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
14009 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
14010 return false;
14011 }
14012 return true;
14013}
14014
14015static bool isINSMask(ArrayRef<int> M, int NumInputElements,
14016 bool &DstIsLeft, int &Anomaly) {
14017 if (M.size() != static_cast<size_t>(NumInputElements))
14018 return false;
14019
14020 int NumLHSMatch = 0, NumRHSMatch = 0;
14021 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14022
14023 for (int i = 0; i < NumInputElements; ++i) {
14024 if (M[i] == -1) {
14025 ++NumLHSMatch;
14026 ++NumRHSMatch;
14027 continue;
14028 }
14029
14030 if (M[i] == i)
14031 ++NumLHSMatch;
14032 else
14033 LastLHSMismatch = i;
14034
14035 if (M[i] == i + NumInputElements)
14036 ++NumRHSMatch;
14037 else
14038 LastRHSMismatch = i;
14039 }
14040
14041 if (NumLHSMatch == NumInputElements - 1) {
14042 DstIsLeft = true;
14043 Anomaly = LastLHSMismatch;
14044 return true;
14045 } else if (NumRHSMatch == NumInputElements - 1) {
14046 DstIsLeft = false;
14047 Anomaly = LastRHSMismatch;
14048 return true;
14049 }
14050
14051 return false;
14052}
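// For example, with 4 input elements the mask <0, 1, 6, 3> matches the LHS in
// every lane except lane 2, so this returns DstIsLeft = true and Anomaly = 2:
// the shuffle is a single INS of an RHS element into lane 2 of the LHS.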
14053
14054static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14055 if (VT.getSizeInBits() != 128)
14056 return false;
14057
14058 unsigned NumElts = VT.getVectorNumElements();
14059
14060 for (int I = 0, E = NumElts / 2; I != E; I++) {
14061 if (Mask[I] != I)
14062 return false;
14063 }
14064
14065 int Offset = NumElts / 2;
14066 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14067 if (Mask[I] != I + SplitLHS * Offset)
14068 return false;
14069 }
14070
14071 return true;
14072}
14073
14074 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
14075 SDLoc DL(Op);
14076 EVT VT = Op.getValueType();
14077 SDValue V0 = Op.getOperand(0);
14078 SDValue V1 = Op.getOperand(1);
14079 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14080
14081 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
14082 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
14083 return SDValue();
14084
14085 bool SplitV0 = V0.getValueSizeInBits() == 128;
14086
14087 if (!isConcatMask(Mask, VT, SplitV0))
14088 return SDValue();
14089
14090 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14091 if (SplitV0) {
14092 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14093 DAG.getConstant(0, DL, MVT::i64));
14094 }
14095 if (V1.getValueSizeInBits() == 128) {
14096 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14097 DAG.getConstant(0, DL, MVT::i64));
14098 }
14099 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14100}
14101
14102/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14103/// the specified operations to build the shuffle. ID is the perfect-shuffle
14104 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
14105 /// table entry and LHS/RHS are the immediate inputs for this stage of the
14106 /// shuffle.
14107 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
14108 unsigned PFEntry, SDValue LHS,
14109 SDValue RHS, SelectionDAG &DAG,
14110 const SDLoc &DL) {
14111 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14112 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14113 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14114
14115 enum {
14116 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14117 OP_VREV,
14118 OP_VDUP0,
14119 OP_VDUP1,
14120 OP_VDUP2,
14121 OP_VDUP3,
14122 OP_VEXT1,
14123 OP_VEXT2,
14124 OP_VEXT3,
14125 OP_VUZPL, // VUZP, left result
14126 OP_VUZPR, // VUZP, right result
14127 OP_VZIPL, // VZIP, left result
14128 OP_VZIPR, // VZIP, right result
14129 OP_VTRNL, // VTRN, left result
14130 OP_VTRNR, // VTRN, right result
14131 OP_MOVLANE // Move lane. RHSID is the lane to move into
14132 };
14133
14134 if (OpNum == OP_COPY) {
14135 if (LHSID == (1 * 9 + 2) * 9 + 3)
14136 return LHS;
14137 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14138 return RHS;
14139 }
14140
14141 if (OpNum == OP_MOVLANE) {
14142 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14143 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14144 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14145 Elt = 3 - Elt;
14146 while (Elt > 0) {
14147 ID /= 9;
14148 Elt--;
14149 }
14150 return (ID % 9 == 8) ? -1 : ID % 9;
14151 };
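 // For example, getPFIDLane(102, 2) == 2: 102 encodes the mask <0,1,2,3>
 // (base-9 digits, lane 0 first), so lane 2's source is element 2.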
14152
14153 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14154 // get the lane to move from the PFID, which is always from the
14155 // original vectors (V1 or V2).
14156 SDValue OpLHS = GeneratePerfectShuffle(
14157 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14158 EVT VT = OpLHS.getValueType();
14159 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14160 unsigned ExtLane = 0;
14161 SDValue Input;
14162
14163 // OP_MOVLANE is either a D mov (if bit 0x4 is set) or an S mov. D movs
14164 // convert into a higher type.
14165 if (RHSID & 0x4) {
14166 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14167 if (MaskElt == -1)
14168 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14169 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14170 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14171 Input = MaskElt < 2 ? V1 : V2;
14172 if (VT.getScalarSizeInBits() == 16) {
14173 Input = DAG.getBitcast(MVT::v2f32, Input);
14174 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14175 } else {
14176 assert(VT.getScalarSizeInBits() == 32 &&
14177 "Expected 16 or 32 bit shuffle elements");
14178 Input = DAG.getBitcast(MVT::v2f64, Input);
14179 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14180 }
14181 } else {
14182 int MaskElt = getPFIDLane(ID, RHSID);
14183 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14184 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14185 Input = MaskElt < 4 ? V1 : V2;
14186 // Be careful about creating illegal types. Use f16 instead of i16.
14187 if (VT == MVT::v4i16) {
14188 Input = DAG.getBitcast(MVT::v4f16, Input);
14189 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14190 }
14191 }
14192 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
14193 Input.getValueType().getVectorElementType(),
14194 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14195 SDValue Ins =
14196 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14197 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14198 return DAG.getBitcast(VT, Ins);
14199 }
14200
14201 SDValue OpLHS, OpRHS;
14202 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14203 RHS, DAG, DL);
14204 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14205 RHS, DAG, DL);
14206 EVT VT = OpLHS.getValueType();
14207
14208 switch (OpNum) {
14209 default:
14210 llvm_unreachable("Unknown shuffle opcode!");
14211 case OP_VREV:
14212 // VREV divides the vector in half and swaps within the half.
14213 if (VT.getVectorElementType() == MVT::i32 ||
14214 VT.getVectorElementType() == MVT::f32)
14215 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14216 // vrev <4 x i16> -> REV32
14217 if (VT.getVectorElementType() == MVT::i16 ||
14218 VT.getVectorElementType() == MVT::f16 ||
14219 VT.getVectorElementType() == MVT::bf16)
14220 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14221 // vrev <4 x i8> -> REV16
14222 assert(VT.getVectorElementType() == MVT::i8);
14223 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14224 case OP_VDUP0:
14225 case OP_VDUP1:
14226 case OP_VDUP2:
14227 case OP_VDUP3: {
14228 EVT EltTy = VT.getVectorElementType();
14229 unsigned Opcode;
14230 if (EltTy == MVT::i8)
14231 Opcode = AArch64ISD::DUPLANE8;
14232 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14233 Opcode = AArch64ISD::DUPLANE16;
14234 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14235 Opcode = AArch64ISD::DUPLANE32;
14236 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14237 Opcode = AArch64ISD::DUPLANE64;
14238 else
14239 llvm_unreachable("Invalid vector element type?");
14240
14241 if (VT.getSizeInBits() == 64)
14242 OpLHS = WidenVector(OpLHS, DAG);
14243 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14244 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14245 }
14246 case OP_VEXT1:
14247 case OP_VEXT2:
14248 case OP_VEXT3: {
14249 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14250 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14251 DAG.getConstant(Imm, DL, MVT::i32));
14252 }
14253 case OP_VUZPL:
14254 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14255 case OP_VUZPR:
14256 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14257 case OP_VZIPL:
14258 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14259 case OP_VZIPR:
14260 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14261 case OP_VTRNL:
14262 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14263 case OP_VTRNR:
14264 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14265 }
14266}
14267
14268static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
14269 SelectionDAG &DAG) {
14270 // Check to see if we can use the TBL instruction.
14271 SDValue V1 = Op.getOperand(0);
14272 SDValue V2 = Op.getOperand(1);
14273 SDLoc DL(Op);
14274
14275 EVT EltVT = Op.getValueType().getVectorElementType();
14276 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14277
14278 bool Swap = false;
14279 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14280 std::swap(V1, V2);
14281 Swap = true;
14282 }
14283
14284 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14285 // out of range values with 0s. We do need to make sure that any out-of-range
14286 // values are really out-of-range for a v16i8 vector.
14287 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14288 MVT IndexVT = MVT::v8i8;
14289 unsigned IndexLen = 8;
14290 if (Op.getValueSizeInBits() == 128) {
14291 IndexVT = MVT::v16i8;
14292 IndexLen = 16;
14293 }
14294
14295 SmallVector<SDValue, 8> TBLMask;
14296 for (int Val : ShuffleMask) {
14297 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14298 unsigned Offset = Byte + Val * BytesPerElt;
14299 if (Swap)
14300 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14301 if (IsUndefOrZero && Offset >= IndexLen)
14302 Offset = 255;
14303 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14304 }
14305 }
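 // For example, a v4i16 shuffle with mask <0,5,1,6> (and neither source
 // undef/zero) expands to the byte-level TBL mask {0,1,10,11,2,3,12,13},
 // where byte indices of 8 and above select from the second source once the
 // two sources are concatenated.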
14306
14307 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14308 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14309
14310 SDValue Shuffle;
14311 if (IsUndefOrZero) {
14312 if (IndexLen == 8)
14313 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14314 Shuffle = DAG.getNode(
14315 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14316 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14317 V1Cst,
14318 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14319 } else {
14320 if (IndexLen == 8) {
14321 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14322 Shuffle = DAG.getNode(
14323 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14324 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14325 V1Cst,
14326 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14327 } else {
14328 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14329 // cannot currently represent the register constraints on the input
14330 // table registers.
14331 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14332 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14333 // IndexLen));
14334 Shuffle = DAG.getNode(
14335 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14336 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
14337 V1Cst, V2Cst,
14338 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14339 }
14340 }
14341 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14342}
14343
14344static unsigned getDUPLANEOp(EVT EltType) {
14345 if (EltType == MVT::i8)
14346 return AArch64ISD::DUPLANE8;
14347 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14348 return AArch64ISD::DUPLANE16;
14349 if (EltType == MVT::i32 || EltType == MVT::f32)
14350 return AArch64ISD::DUPLANE32;
14351 if (EltType == MVT::i64 || EltType == MVT::f64)
14352 return AArch64ISD::DUPLANE64;
14353
14354 llvm_unreachable("Invalid vector element type?");
14355}
14356
14357static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14358 unsigned Opcode, SelectionDAG &DAG) {
14359 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14360 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14361 // Match: dup (bitcast (extract_subv X, C)), LaneC
14362 if (BitCast.getOpcode() != ISD::BITCAST ||
14363 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14364 return false;
14365
14366 // The extract index must align in the destination type. That may not
14367 // happen if the bitcast is from narrow to wide type.
14368 SDValue Extract = BitCast.getOperand(0);
14369 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14370 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14371 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14372 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14373 if (ExtIdxInBits % CastedEltBitWidth != 0)
14374 return false;
14375
14376 // Can't handle cases where vector size is not 128-bit
14377 if (!Extract.getOperand(0).getValueType().is128BitVector())
14378 return false;
14379
14380 // Update the lane value by offsetting with the scaled extract index.
14381 LaneC += ExtIdxInBits / CastedEltBitWidth;
14382
14383 // Determine the casted vector type of the wide vector input.
14384 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14385 // Examples:
14386 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14387 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14388 unsigned SrcVecNumElts =
14389 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14390 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
14391 SrcVecNumElts);
14392 return true;
14393 };
14394 MVT CastVT;
14395 if (getScaledOffsetDup(V, Lane, CastVT)) {
14396 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14397 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14398 V.getOperand(0).getValueType().is128BitVector()) {
14399 // The lane is incremented by the index of the extract.
14400 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14401 Lane += V.getConstantOperandVal(1);
14402 V = V.getOperand(0);
14403 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14404 // The lane is decremented if we are splatting from the 2nd operand.
14405 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14406 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14407 Lane -= Idx * VT.getVectorNumElements() / 2;
14408 V = WidenVector(V.getOperand(Idx), DAG);
14409 } else if (VT.getSizeInBits() == 64) {
14410 // Widen the operand to 128-bit register with undef.
14411 V = WidenVector(V, DAG);
14412 }
14413 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14414}
14415
14416// Try to widen element type to get a new mask value for a better permutation
14417// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14418// UZP1/2, TRN1/2, REV, INS, etc.
14419// For example:
14420// shufflevector <4 x i32> %a, <4 x i32> %b,
14421// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14422// is equivalent to:
14423// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14424// Finally, we can get:
14425// mov v0.d[0], v1.d[1]
14426static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14427 SDLoc DL(Op);
14428 EVT VT = Op.getValueType();
14429 EVT ScalarVT = VT.getVectorElementType();
14430 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14431 SDValue V0 = Op.getOperand(0);
14432 SDValue V1 = Op.getOperand(1);
14433 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14434
14435 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
14436 // We need to make sure the wider element type is legal. Thus, ElementSize
14437 // should not be larger than 32 bits, and the i1 type should also be excluded.
14438 if (ElementSize > 32 || ElementSize == 1)
14439 return SDValue();
14440
14441 SmallVector<int, 8> NewMask;
14442 if (widenShuffleMaskElts(Mask, NewMask)) {
14443 MVT NewEltVT = VT.isFloatingPoint()
14444 ? MVT::getFloatingPointVT(ElementSize * 2)
14445 : MVT::getIntegerVT(ElementSize * 2);
14446 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14447 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14448 V0 = DAG.getBitcast(NewVT, V0);
14449 V1 = DAG.getBitcast(NewVT, V1);
14450 return DAG.getBitcast(VT,
14451 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14452 }
14453 }
14454
14455 return SDValue();
14456}
14457
14458// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14459static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14460 ArrayRef<int> ShuffleMask,
14461 SelectionDAG &DAG) {
14462 SDValue Tbl1 = Op->getOperand(0);
14463 SDValue Tbl2 = Op->getOperand(1);
14464 SDLoc DL(Op);
14465 SDValue Tbl2ID =
14466 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14467
14468 EVT VT = Op.getValueType();
14469 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14470 Tbl1.getOperand(0) != Tbl2ID ||
14471 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14472 Tbl2.getOperand(0) != Tbl2ID)
14473 return SDValue();
14474
14475 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14476 return SDValue();
14477
14478 SDValue Mask1 = Tbl1.getOperand(3);
14479 SDValue Mask2 = Tbl2.getOperand(3);
14480 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14481 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14482 return SDValue();
14483
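 // Build a tbl4 mask from the two tbl2 masks: shuffle elements below 16 take
 // their byte index straight from Tbl1's mask, while elements of 16 or more
 // take Tbl2's mask entry plus 32, since Tbl2's two table registers become
 // tables three and four of the tbl4 (their bytes start at index 32).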
14484 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14485 for (unsigned I = 0; I < 16; I++) {
14486 if (ShuffleMask[I] < 16)
14487 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14488 else {
14489 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14490 if (!C)
14491 return SDValue();
14492 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14493 }
14494 }
14495
14496 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14497 SDValue ID =
14498 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14499
14500 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14501 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14502 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14503}
14504
14505// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14506// but we don't have an appropriate instruction,
14507// so custom-lower it as ZIP1-with-zeros.
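// For example, (v8i16 zero_extend_vector_inreg (v16i8 X)) becomes
// bitcast(ZIP1(X, zeroes)): interleaving the low source lanes with zero lanes
// leaves each of those lanes zero-extended once the result is reinterpreted
// at the wider element width.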
14508SDValue
14509AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14510 SelectionDAG &DAG) const {
14511 SDLoc DL(Op);
14512 EVT VT = Op.getValueType();
14513 SDValue SrcOp = Op.getOperand(0);
14514 EVT SrcVT = SrcOp.getValueType();
14515 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14516 "Unexpected extension factor.");
14517 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14518 // FIXME: support multi-step zipping?
14519 if (Scale != 2)
14520 return SDValue();
14521 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14522 return DAG.getBitcast(VT,
14523 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14524}
14525
14526SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14527 SelectionDAG &DAG) const {
14528 SDLoc DL(Op);
14529 EVT VT = Op.getValueType();
14530
14531 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14532
14533 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14534 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14535
14536 // Convert shuffles that are directly supported on NEON to target-specific
14537 // DAG nodes, instead of keeping them as shuffles and matching them again
14538 // during code selection. This is more efficient and avoids the possibility
14539 // of inconsistencies between legalization and selection.
14540 ArrayRef<int> ShuffleMask = SVN->getMask();
14541
14542 SDValue V1 = Op.getOperand(0);
14543 SDValue V2 = Op.getOperand(1);
14544
14545 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14546 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14547 "Unexpected VECTOR_SHUFFLE mask size!");
14548
14549 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14550 return Res;
14551
14552 if (SVN->isSplat()) {
14553 int Lane = SVN->getSplatIndex();
14554 // If this is undef splat, generate it via "just" vdup, if possible.
14555 if (Lane == -1)
14556 Lane = 0;
14557
14558 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14559 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14560 V1.getOperand(0));
14561 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14562 // constant. If so, we can just reference the lane's definition directly.
14563 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14564 !isa<ConstantSDNode>(V1.getOperand(Lane)))
14565 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14566
14567 // Otherwise, duplicate from the lane of the input vector.
14568 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14569 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14570 }
14571
14572 // Check if the mask matches a DUP for a wider element
14573 for (unsigned LaneSize : {64U, 32U, 16U}) {
14574 unsigned Lane = 0;
14575 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14576 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14577 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14578 : AArch64ISD::DUPLANE16;
14579 // Cast V1 to an integer vector with required lane size
14580 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14581 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14582 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14583 V1 = DAG.getBitcast(NewVecTy, V1);
14584 // Construct the DUP instruction
14585 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14586 // Cast back to the original type
14587 return DAG.getBitcast(VT, V1);
14588 }
14589 }
14590
14591 unsigned NumElts = VT.getVectorNumElements();
14592 unsigned EltSize = VT.getScalarSizeInBits();
14593 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
14594 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
14595 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
14596 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
14597 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
14598 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
14599
14600 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14601 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
14602 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
14603 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
14604 DAG.getConstant(8, DL, MVT::i32));
14605 }
14606
14607 bool ReverseEXT = false;
14608 unsigned Imm;
14609 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
14610 if (ReverseEXT)
14611 std::swap(V1, V2);
14612 Imm *= getExtFactor(V1);
14613 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
14614 DAG.getConstant(Imm, DL, MVT::i32));
14615 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
14616 Imm *= getExtFactor(V1);
14617 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
14618 DAG.getConstant(Imm, DL, MVT::i32));
14619 }
14620
14621 unsigned WhichResult;
14622 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
14623 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14624 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14625 }
14626 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
14627 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14628 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14629 }
14630 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
14631 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14632 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14633 }
14634
14635 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14636 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14637 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14638 }
14639 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14640 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14641 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14642 }
14643 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14644 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14645 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14646 }
14647
14648 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14649 return Concat;
14650
14651 bool DstIsLeft;
14652 int Anomaly;
14653 int NumInputElements = V1.getValueType().getVectorNumElements();
14654 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14655 SDValue DstVec = DstIsLeft ? V1 : V2;
14656 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
14657
14658 SDValue SrcVec = V1;
14659 int SrcLane = ShuffleMask[Anomaly];
14660 if (SrcLane >= NumInputElements) {
14661 SrcVec = V2;
14662 SrcLane -= NumElts;
14663 }
14664 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
14665
14666 EVT ScalarVT = VT.getVectorElementType();
14667
14668 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14669 ScalarVT = MVT::i32;
14670
14671 return DAG.getNode(
14672 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
14673 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
14674 DstLaneV);
14675 }
14676
14677 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14678 return NewSD;
14679
14680 // If the shuffle is not directly supported and it has 4 elements, use
14681 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14682 if (NumElts == 4) {
14683 unsigned PFIndexes[4];
14684 for (unsigned i = 0; i != 4; ++i) {
14685 if (ShuffleMask[i] < 0)
14686 PFIndexes[i] = 8;
14687 else
14688 PFIndexes[i] = ShuffleMask[i];
14689 }
14690
14691 // Compute the index in the perfect shuffle table.
14692 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14693 PFIndexes[2] * 9 + PFIndexes[3];
14694 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14695 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14696 DL);
14697 }
14698
14699 // Check for a "select shuffle", generating a BSL to pick between lanes in
14700 // V1/V2.
14701 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14702 assert(VT.getScalarSizeInBits() <= 32 &&
14703 "Expected larger vector element sizes to be handled already");
14704 SmallVector<SDValue> MaskElts;
14705 for (int M : ShuffleMask)
14706 MaskElts.push_back(DAG.getConstant(
14707 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
14708 EVT IVT = VT.changeVectorElementTypeToInteger();
14709 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
14710 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
14711 DAG.getBitcast(IVT, V1),
14712 DAG.getBitcast(IVT, V2)));
14713 }
14714
14715 // Fall back to generating a TBL
14716 return GenerateTBL(Op, ShuffleMask, DAG);
14717}
14718
14719SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14720 SelectionDAG &DAG) const {
14721 EVT VT = Op.getValueType();
14722
14723 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14724 return LowerToScalableOp(Op, DAG);
14725
14726 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14727 "Unexpected vector type!");
14728
14729 // We can handle the constant cases during isel.
14730 if (isa<ConstantSDNode>(Op.getOperand(0)))
14731 return Op;
14732
14733 // There isn't a natural way to handle the general i1 case, so we use some
14734 // trickery with whilelo.
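 // The splat value is sign-extended from i1 to i64, so it is either 0 or -1;
 // whilelo(0, 0) yields an all-false predicate, while whilelo(0, -1) compares
 // unsigned against UINT64_MAX and so yields an all-true one.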
14735 SDLoc DL(Op);
14736 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14737 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14738 DAG.getValueType(MVT::i1));
14739 SDValue ID =
14740 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14741 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14742 if (VT == MVT::nxv1i1)
14743 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14744 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14745 Zero, SplatVal),
14746 Zero);
14747 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14748}
14749
14750SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14751 SelectionDAG &DAG) const {
14752 SDLoc DL(Op);
14753
14754 EVT VT = Op.getValueType();
14755 if (!isTypeLegal(VT) || !VT.isScalableVector())
14756 return SDValue();
14757
14758 // Current lowering only supports the SVE-ACLE types.
14759 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14760 return SDValue();
14761
14762 // The DUPQ operation is independent of element type so normalise to i64s.
14763 SDValue Idx128 = Op.getOperand(2);
14764
14765 // DUPQ can be used when idx is in range.
14766 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14767 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14768 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14769 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14770 }
14771
14772 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14773
14774 // The ACLE says this must produce the same result as:
14775 // svtbl(data, svadd_x(svptrue_b64(),
14776 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14777 // index * 2))
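 // For example, with index 1 the mask below becomes <2,3,2,3,...>, so the TBL
 // copies 64-bit elements 2 and 3 (the second 128-bit quadword) into every
 // quadword of the result.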
14778 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14779 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14780
14781 // create the vector 0,1,0,1,...
14782 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14783 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14784
14785 // create the vector idx64,idx64+1,idx64,idx64+1,...
14786 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14787 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14788 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14789
14790 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14791 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14792 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14793}
14794
14795
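// resolveBuildVector packs a constant-splat BUILD_VECTOR into two bit patterns
// the width of the whole vector: CnstBits is the splat value repeated across
// the vector, and UndefBits is the same pattern with the bits contributed by
// undef lanes flipped, giving callers a second candidate pattern when trying
// to match a SIMD immediate.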
14796static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14797 APInt &UndefBits) {
14798 EVT VT = BVN->getValueType(0);
14799 APInt SplatBits, SplatUndef;
14800 unsigned SplatBitSize;
14801 bool HasAnyUndefs;
14802 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14803 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14804
14805 for (unsigned i = 0; i < NumSplats; ++i) {
14806 CnstBits <<= SplatBitSize;
14807 UndefBits <<= SplatBitSize;
14808 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14809 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14810 }
14811
14812 return true;
14813 }
14814
14815 return false;
14816}
14817
14818// Try 64-bit splatted SIMD immediate.
14819static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14820 const APInt &Bits) {
14821 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14822 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14823 EVT VT = Op.getValueType();
14824 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14825
14826 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14827 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14828
14829 SDLoc DL(Op);
14830 SDValue Mov =
14831 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14832 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14833 }
14834 }
14835
14836 return SDValue();
14837}
14838
14839// Try 32-bit splatted SIMD immediate.
14840static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14841 const APInt &Bits,
14842 const SDValue *LHS = nullptr) {
14843 EVT VT = Op.getValueType();
14844 if (VT.isFixedLengthVector() &&
14845 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14846 return SDValue();
14847
14848 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14849 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14850 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14851 bool isAdvSIMDModImm = false;
14852 uint64_t Shift;
14853
14854 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14855 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14856 Shift = 0;
14857 }
14858 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14859 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14860 Shift = 8;
14861 }
14862 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14863 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14864 Shift = 16;
14865 }
14866 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14867 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14868 Shift = 24;
14869 }
14870
14871 if (isAdvSIMDModImm) {
14872 SDLoc DL(Op);
14873 SDValue Mov;
14874
14875 if (LHS)
14876 Mov = DAG.getNode(NewOp, DL, MovTy,
14877 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14878 DAG.getConstant(Value, DL, MVT::i32),
14879 DAG.getConstant(Shift, DL, MVT::i32));
14880 else
14881 Mov =
14882 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14883 DAG.getConstant(Shift, DL, MVT::i32));
14884
14885 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14886 }
14887 }
14888
14889 return SDValue();
14890}
14891
14892// Try 16-bit splatted SIMD immediate.
14893static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14894 const APInt &Bits,
14895 const SDValue *LHS = nullptr) {
14896 EVT VT = Op.getValueType();
14897 if (VT.isFixedLengthVector() &&
14898 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14899 return SDValue();
14900
14901 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14902 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14903 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14904 bool isAdvSIMDModImm = false;
14905 uint64_t Shift;
14906
14907 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14908 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14909 Shift = 0;
14910 }
14911 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14912 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14913 Shift = 8;
14914 }
14915
14916 if (isAdvSIMDModImm) {
14917 SDLoc DL(Op);
14918 SDValue Mov;
14919
14920 if (LHS)
14921 Mov = DAG.getNode(NewOp, DL, MovTy,
14922 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14923 DAG.getConstant(Value, DL, MVT::i32),
14924 DAG.getConstant(Shift, DL, MVT::i32));
14925 else
14926 Mov =
14927 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14928 DAG.getConstant(Shift, DL, MVT::i32));
14929
14930 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14931 }
14932 }
14933
14934 return SDValue();
14935}
14936
14937// Try 32-bit splatted SIMD immediate with shifted ones.
14938static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14939 SelectionDAG &DAG, const APInt &Bits) {
14940 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14941 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14942 EVT VT = Op.getValueType();
14943 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14944 bool isAdvSIMDModImm = false;
14945 uint64_t Shift;
14946
14947 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14948 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14949 Shift = 264;
14950 }
14951 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14952 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14953 Shift = 272;
14954 }
14955
14956 if (isAdvSIMDModImm) {
14957 SDLoc DL(Op);
14958 SDValue Mov =
14959 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14960 DAG.getConstant(Shift, DL, MVT::i32));
14961 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14962 }
14963 }
14964
14965 return SDValue();
14966}
14967
14968// Try 8-bit splatted SIMD immediate.
14969static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14970 const APInt &Bits) {
14971 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14972 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14973 EVT VT = Op.getValueType();
14974 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14975
14976 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14977 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14978
14979 SDLoc DL(Op);
14980 SDValue Mov =
14981 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14982 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14983 }
14984 }
14985
14986 return SDValue();
14987}
14988
14989// Try FP splatted SIMD immediate.
14990static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14991 const APInt &Bits) {
14992 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14993 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14994 EVT VT = Op.getValueType();
14995 bool isWide = (VT.getSizeInBits() == 128);
14996 MVT MovTy;
14997 bool isAdvSIMDModImm = false;
14998
14999 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
15000 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
15001 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
15002 }
15003 else if (isWide &&
15004 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
15005 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
15006 MovTy = MVT::v2f64;
15007 }
15008
15009 if (isAdvSIMDModImm) {
15010 SDLoc DL(Op);
15011 SDValue Mov =
15012 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15013 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15014 }
15015 }
15016
15017 return SDValue();
15018}
15019
15020// Specialized code to quickly check whether PotentialBVec is a BUILD_VECTOR
15021// whose elements are all the same constant integer value, which is returned
15022// in the reference argument ConstVal.
15023static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
15024 uint64_t &ConstVal) {
15025 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
15026 if (!Bvec)
15027 return false;
15028 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
15029 if (!FirstElt)
15030 return false;
15031 EVT VT = Bvec->getValueType(0);
15032 unsigned NumElts = VT.getVectorNumElements();
15033 for (unsigned i = 1; i < NumElts; ++i)
15034 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
15035 return false;
15036 ConstVal = FirstElt->getZExtValue();
15037 return true;
15038}
15039
15040static bool isAllInactivePredicate(SDValue N) {
15041 // Look through cast.
15042 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
15043 N = N.getOperand(0);
15044
15045 return ISD::isConstantSplatVectorAllZeros(N.getNode());
15046}
15047
15048static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
15049 unsigned NumElts = N.getValueType().getVectorMinNumElements();
15050
15051 // Look through cast.
15052 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
15053 N = N.getOperand(0);
15054 // When reinterpreting from a type with fewer elements the "new" elements
15055 // are not active, so bail if they're likely to be used.
15056 if (N.getValueType().getVectorMinNumElements() < NumElts)
15057 return false;
15058 }
15059
15060 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
15061 return true;
15062
15063 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
15064 // or smaller than the implicit element type represented by N.
15065 // NOTE: A larger element count implies a smaller element type.
15066 if (N.getOpcode() == AArch64ISD::PTRUE &&
15067 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
15068 return N.getValueType().getVectorMinNumElements() >= NumElts;
15069
15070 // If we're compiling for a specific vector-length, we can check if the
15071 // pattern's VL equals that of the scalable vector at runtime.
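 // For example, if the minimum and maximum SVE vector lengths are both known
 // to be 256 bits, VScale is 2, so "ptrue p.s, vl8" is all active for an
 // nxv4i1 predicate (4 elements * 2 == 8).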
15072 if (N.getOpcode() == AArch64ISD::PTRUE) {
15073 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15074 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
15075 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
15076 if (MaxSVESize && MinSVESize == MaxSVESize) {
15077 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
15078 unsigned PatNumElts =
15079 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
15080 return PatNumElts == (NumElts * VScale);
15081 }
15082 }
15083
15084 return false;
15085}
15086
15087// Attempt to form a vector S[LR]I instruction from (or (and X, BvecC1), (lsl Y, C2)),
15088// i.e. (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15089// BUILD_VECTOR with constant element C1, C2 is a constant, and:
15090// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15091// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15092// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
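// For example, on v4i32 with C2 == 8:
//   (or (and X, splat(0x000000ff)), (VSHL Y, #8))  --> (VSLI X, Y, #8)
//   (or (and X, splat(0xff000000)), (VLSHR Y, #8)) --> (VSRI X, Y, #8)
// In each lane the masked bits of X are kept and the remaining bits come from
// the shifted Y.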
15093static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
15094 EVT VT = N->getValueType(0);
15095
15096 if (!VT.isVector())
15097 return SDValue();
15098
15099 SDLoc DL(N);
15100
15101 SDValue And;
15102 SDValue Shift;
15103
15104 SDValue FirstOp = N->getOperand(0);
15105 unsigned FirstOpc = FirstOp.getOpcode();
15106 SDValue SecondOp = N->getOperand(1);
15107 unsigned SecondOpc = SecondOp.getOpcode();
15108
15109 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15110 // a BICi in order to use an immediate instead of a register.
15111 // Is the other operand a shl or lshr? This will have been turned into:
15112 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15113 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15114 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15115 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15116 SecondOpc == AArch64ISD::SHL_PRED ||
15117 SecondOpc == AArch64ISD::SRL_PRED)) {
15118 And = FirstOp;
15119 Shift = SecondOp;
15120
15121 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15122 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15123 FirstOpc == AArch64ISD::SHL_PRED ||
15124 FirstOpc == AArch64ISD::SRL_PRED)) {
15125 And = SecondOp;
15126 Shift = FirstOp;
15127 } else
15128 return SDValue();
15129
15130 bool IsAnd = And.getOpcode() == ISD::AND;
15131 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15132 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15133 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15134 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15135
15136 // Is the shift amount constant and are all lanes active?
15137 uint64_t C2;
15138 if (ShiftHasPredOp) {
15139 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15140 return SDValue();
15141 APInt C;
15142 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
15143 return SDValue();
15144 C2 = C.getZExtValue();
15145 } else if (ConstantSDNode *C2node =
15146 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
15147 C2 = C2node->getZExtValue();
15148 else
15149 return SDValue();
15150
15151 APInt C1AsAPInt;
15152 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15153 if (IsAnd) {
15154 // Is the and mask vector all constant?
15155 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15156 return SDValue();
15157 } else {
15158 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15159 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15160 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15161 assert(C1nodeImm && C1nodeShift);
15162 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15163 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15164 }
15165
15166 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15167 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15168 // how much one can shift elements of a particular size?
15169 if (C2 > ElemSizeInBits)
15170 return SDValue();
15171
15172 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15173 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15174 if (C1AsAPInt != RequiredC1)
15175 return SDValue();
15176
15177 SDValue X = And.getOperand(0);
15178 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15179 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15180 : Shift.getOperand(1);
15181
15182 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15183 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15184}
15185
15186static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
15187 EVT VT = N->getValueType(0);
15188 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15189 SDLoc DL(N);
15190 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15191
15192 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15193 return SDValue();
15194
15195 SDValue N0 = N->getOperand(0);
15196 if (N0.getOpcode() != ISD::AND)
15197 return SDValue();
15198
15199 SDValue N1 = N->getOperand(1);
15200 if (N1.getOpcode() != ISD::AND)
15201 return SDValue();
15202
15203 // InstCombine does (not (neg a)) => (add a -1).
15204 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15205 // Loop over all combinations of AND operands.
15206 for (int i = 1; i >= 0; --i) {
15207 for (int j = 1; j >= 0; --j) {
15208 SDValue O0 = N0->getOperand(i);
15209 SDValue O1 = N1->getOperand(j);
15210 SDValue Sub, Add, SubSibling, AddSibling;
15211
15212 // Find a SUB and an ADD operand, one from each AND.
15213 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15214 Sub = O0;
15215 Add = O1;
15216 SubSibling = N0->getOperand(1 - i);
15217 AddSibling = N1->getOperand(1 - j);
15218 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15219 Add = O0;
15220 Sub = O1;
15221 AddSibling = N0->getOperand(1 - i);
15222 SubSibling = N1->getOperand(1 - j);
15223 } else
15224 continue;
15225
15226 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15227 continue;
15228
15229 // The all-ones constant is always the right-hand operand of the Add.
15230 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15231 continue;
15232
15233 if (Sub.getOperand(1) != Add.getOperand(0))
15234 continue;
15235
15236 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15237 }
15238 }
15239
15240 // (or (and a b) (and (not a) c)) => (bsl a b c)
15241 // We only have to look for constant vectors here since the general, variable
15242 // case can be handled in TableGen.
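 // Note that BSP(mask, b, c) selects bits from b where the mask bit is set and
 // from c where it is clear, which is exactly the OR-of-ANDs pattern above.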
15243 unsigned Bits = VT.getScalarSizeInBits();
15244 for (int i = 1; i >= 0; --i)
15245 for (int j = 1; j >= 0; --j) {
15246 APInt Val1, Val2;
15247
15248 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15249 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15250 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15251 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15252 N0->getOperand(1 - i), N1->getOperand(1 - j));
15253 }
15254 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15255 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15256 if (!BVN0 || !BVN1)
15257 continue;
15258
15259 bool FoundMatch = true;
15260 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15261 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15262 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15263 if (!CN0 || !CN1 ||
15264 CN0->getAPIntValue().trunc(Bits) !=
15265 ~CN1->getAsAPIntVal().trunc(Bits)) {
15266 FoundMatch = false;
15267 break;
15268 }
15269 }
15270 if (FoundMatch)
15271 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15272 N0->getOperand(1 - i), N1->getOperand(1 - j));
15273 }
15274
15275 return SDValue();
15276}
15277
15278SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15279 SelectionDAG &DAG) const {
15280 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15281 !Subtarget->isNeonAvailable()))
15282 return LowerToScalableOp(Op, DAG);
15283
15284 if (SDValue Res = tryLowerToBSL(Op, DAG))
15285 return Res;
15286
15287 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15288 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15289 return Res;
15290
15291 EVT VT = Op.getValueType();
15292 if (VT.isScalableVector())
15293 return Op;
15294
15295 SDValue LHS = Op.getOperand(0);
15296 BuildVectorSDNode *BVN =
15297 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15298 if (!BVN) {
15299 // OR commutes, so try swapping the operands.
15300 LHS = Op.getOperand(1);
15301 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15302 }
15303 if (!BVN)
15304 return Op;
15305
15306 APInt DefBits(VT.getSizeInBits(), 0);
15307 APInt UndefBits(VT.getSizeInBits(), 0);
15308 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15309 SDValue NewOp;
15310
15311 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15312 DefBits, &LHS)) ||
15313 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15314 DefBits, &LHS)))
15315 return NewOp;
15316
15317 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15318 UndefBits, &LHS)) ||
15319 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15320 UndefBits, &LHS)))
15321 return NewOp;
15322 }
15323
15324 // We can always fall back to a non-immediate OR.
15325 return Op;
15326}
15327
15328// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15329// be truncated to fit element width.
15330static SDValue NormalizeBuildVector(SDValue Op,
15331 SelectionDAG &DAG) {
15332 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15333 SDLoc DL(Op);
15334 EVT VT = Op.getValueType();
15335 EVT EltTy = VT.getVectorElementType();
15336
15337 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15338 return Op;
15339
15340 SmallVector<SDValue, 16> Ops;
15341 for (SDValue Lane : Op->ops()) {
15342 // For integer vectors, type legalization would have promoted the
15343 // operands already. Otherwise, if Op is a floating-point splat
15344 // (with operands cast to integers), then the only possibilities
15345 // are constants and UNDEFs.
15346 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15347 Lane = DAG.getConstant(
15348 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15349 DL, MVT::i32);
15350 } else if (Lane.getNode()->isUndef()) {
15351 Lane = DAG.getUNDEF(MVT::i32);
15352 } else {
15353 assert(Lane.getValueType() == MVT::i32 &&
15354 "Unexpected BUILD_VECTOR operand type");
15355 }
15356 Ops.push_back(Lane);
15357 }
15358 return DAG.getBuildVector(VT, DL, Ops);
15359}
15360
15361static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
15362 const AArch64Subtarget *ST, APInt &DefBits) {
15363 EVT VT = Op.getValueType();
15364 // TODO: We should be able to support 64-bit destinations too
15365 if (!ST->hasSVE() || !VT.is128BitVector() ||
15366 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15367 return SDValue();
15368
15369 // See if we can make use of the SVE dup instruction.
15370 APInt Val64 = DefBits.trunc(64);
15371 int32_t ImmVal, ShiftVal;
15372 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal))
15373 return SDValue();
15374
15375 SDLoc DL(Op);
15376 SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
15377 DAG.getConstant(Val64, DL, MVT::i64));
15378 SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
15379 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
15380}
15381
15382static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15383 const AArch64Subtarget *ST) {
15384 EVT VT = Op.getValueType();
15385 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15386 "Expected a legal NEON vector");
15387
15388 APInt DefBits(VT.getSizeInBits(), 0);
15389 APInt UndefBits(VT.getSizeInBits(), 0);
15390 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15391 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15392 auto TryMOVIWithBits = [&](APInt DefBits) {
15393 SDValue NewOp;
15394 if ((NewOp =
15395 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15396 (NewOp =
15397 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15398 (NewOp =
15399 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15400 (NewOp =
15401 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15402 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15403 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15404 return NewOp;
15405
15406 APInt NotDefBits = ~DefBits;
15407 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15408 NotDefBits)) ||
15409 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15410 NotDefBits)) ||
15411 (NewOp =
15412 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15413 return NewOp;
15414 return SDValue();
15415 };
15416 if (SDValue R = TryMOVIWithBits(DefBits))
15417 return R;
15418 if (SDValue R = TryMOVIWithBits(UndefBits))
15419 return R;
15420
15421 // Try to materialise the constant using SVE when available.
15422 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15423 return R;
15424
15425 // See if a fneg of the constant can be materialized with a MOVI, etc
15426 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15427 // FNegate each sub-element of the constant
15428 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15429 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15430 .zext(VT.getSizeInBits());
15431 APInt NegBits(VT.getSizeInBits(), 0);
15432 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15433 for (unsigned i = 0; i < NumElts; i++)
15434 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15435 NegBits = DefBits ^ NegBits;
15436
15437 // Try to create the new constants with MOVI, and if so generate a fneg
15438 // for it.
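 // For example, a splat of -0.0f (0x80000000 in every lane) has its sign
 // bits cleared to an all-zero pattern, which a MOVI of #0 can materialize,
 // so the build_vector becomes an FNEG of that MOVI.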
15439 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15440 SDLoc DL(Op);
15441 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15442 return DAG.getNode(
15443 AArch64ISD::NVCAST, DL, VT,
15444 DAG.getNode(ISD::FNEG, DL, VFVT,
15445 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15446 }
15447 return SDValue();
15448 };
15449 SDValue R;
15450 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15451 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15452 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15453 return R;
15454 }
15455
15456 return SDValue();
15457}
15458
15459SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15460 SDValue Op, SelectionDAG &DAG) const {
15461 EVT VT = Op.getValueType();
15462 SDLoc DL(Op);
15463 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15464 auto *BVN = cast<BuildVectorSDNode>(Op);
15465
15466 if (auto SeqInfo = BVN->isConstantSequence()) {
15467 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15468 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15469 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15470 return convertFromScalableVector(DAG, VT, Seq);
15471 }
15472
15473 unsigned NumElems = VT.getVectorNumElements();
15474 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15475 NumElems <= 1 || BVN->isConstant())
15476 return SDValue();
15477
15478 auto IsExtractElt = [](SDValue Op) {
15479 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15480 };
15481
15482 // For integer types that are not already in vectors, limit to at most four
15483 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15484 if (VT.getScalarType().isInteger() &&
15485 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15486 return SDValue();
15487
15488 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
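 // For example, for v4i32 {a,b,c,d}: each element is first placed in lane 0 of
 // its own vector, the first round zips (a,b) and (c,d) at 32-bit granularity,
 // and the second round zips those two results at 64-bit granularity, leaving
 // {a,b,c,d} in the low 128 bits.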
15489 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15490 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15491 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15492 return Op.isUndef() ? Undef
15493 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15494 ContainerVT, Undef, Op, ZeroI64);
15495 });
15496
15497 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15498 while (Intermediates.size() > 1) {
15499 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15500
15501 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15502 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15503 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15504 Intermediates[I / 2] =
15505 Op1.isUndef() ? Op0
15506 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15507 }
15508
15509 Intermediates.resize(Intermediates.size() / 2);
15510 ZipEC = ZipEC.divideCoefficientBy(2);
15511 }
15512
15513 assert(Intermediates.size() == 1);
15514 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15515 return convertFromScalableVector(DAG, VT, Vec);
15516}
15517
15518SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15519 SelectionDAG &DAG) const {
15520 EVT VT = Op.getValueType();
15521
15522 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15523 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15524 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15525 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15526
15527 // Try to build a simple constant vector.
15528 Op = NormalizeBuildVector(Op, DAG);
15529 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
15530 // abort.
15531 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15532 return SDValue();
15533
15534 // Certain vector constants, used to express things like logical NOT and
15535 // arithmetic NEG, are passed through unmodified. This allows special
15536 // patterns for these operations to match, which will lower these constants
15537 // to whatever is proven necessary.
15538 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15539 if (BVN->isConstant()) {
15540 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15541 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15542 APInt Val(BitSize,
15543 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15544 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15545 return Op;
15546 }
15547 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15548 if (Const->isZero() && !Const->isNegative())
15549 return Op;
15550 }
15551
15552 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15553 return V;
15554
15555 // Scan through the operands to find some interesting properties we can
15556 // exploit:
15557 // 1) If only one value is used, we can use a DUP, or
15558 // 2) if only the low element is not undef, we can just insert that, or
15559 // 3) if only one constant value is used (w/ some non-constant lanes),
15560 // we can splat the constant value into the whole vector then fill
15561 // in the non-constant lanes.
15562 // 4) FIXME: If different constant values are used, but we can intelligently
15563 // select the values we'll be overwriting for the non-constant
15564 // lanes such that we can directly materialize the vector
15565 // some other way (MOVI, e.g.), we can be sneaky.
15566 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15567 SDLoc DL(Op);
15568 unsigned NumElts = VT.getVectorNumElements();
15569 bool isOnlyLowElement = true;
15570 bool usesOnlyOneValue = true;
15571 bool usesOnlyOneConstantValue = true;
15572 bool isConstant = true;
15573 bool AllLanesExtractElt = true;
15574 unsigned NumConstantLanes = 0;
15575 unsigned NumDifferentLanes = 0;
15576 unsigned NumUndefLanes = 0;
15577 SDValue Value;
15578 SDValue ConstantValue;
15579 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15580 unsigned ConsecutiveValCount = 0;
15581 SDValue PrevVal;
15582 for (unsigned i = 0; i < NumElts; ++i) {
15583 SDValue V = Op.getOperand(i);
15584 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15585 AllLanesExtractElt = false;
15586 if (V.isUndef()) {
15587 ++NumUndefLanes;
15588 continue;
15589 }
15590 if (i > 0)
15591 isOnlyLowElement = false;
15592 if (!isIntOrFPConstant(V))
15593 isConstant = false;
15594
15595 if (isIntOrFPConstant(V)) {
15596 ++NumConstantLanes;
15597 if (!ConstantValue.getNode())
15598 ConstantValue = V;
15599 else if (ConstantValue != V)
15600 usesOnlyOneConstantValue = false;
15601 }
15602
15603 if (!Value.getNode())
15604 Value = V;
15605 else if (V != Value) {
15606 usesOnlyOneValue = false;
15607 ++NumDifferentLanes;
15608 }
15609
15610 if (PrevVal != V) {
15611 ConsecutiveValCount = 0;
15612 PrevVal = V;
15613 }
15614
15615 // Keep the different values and their last consecutive counts. For example,
15616 //
15617 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15618 // t24, t24, t24, t24, t24, t24, t24, t24
15619 // t23 = consecutive count 8
15620 // t24 = consecutive count 8
15621 // ------------------------------------------------------------------
15622 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
15623 // t24, t24, t24, t24, t24, t24, t24, t24
15624 // t23 = consecutive count 5
15625 // t24 = consecutive count 9
15626 DifferentValueMap[V] = ++ConsecutiveValCount;
15627 }
15628
15629 if (!Value.getNode()) {
15630 LLVM_DEBUG(
15631 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
15632 return DAG.getUNDEF(VT);
15633 }
15634
15635 // Convert BUILD_VECTOR where all elements but the lowest are undef into
15636 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
15637 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
15638 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
15639 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
15640 "SCALAR_TO_VECTOR node\n");
15641 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
15642 }
15643
15644 if (AllLanesExtractElt) {
15645 SDNode *Vector = nullptr;
15646 bool Even = false;
15647 bool Odd = false;
15648 // Check whether the extract elements match the Even pattern <0,2,4,...> or
15649 // the Odd pattern <1,3,5,...>.
15650 for (unsigned i = 0; i < NumElts; ++i) {
15651 SDValue V = Op.getOperand(i);
15652 const SDNode *N = V.getNode();
15653 if (!isa<ConstantSDNode>(N->getOperand(1))) {
15654 Even = false;
15655 Odd = false;
15656 break;
15657 }
15658 SDValue N0 = N->getOperand(0);
15659
15660 // All elements are extracted from the same vector.
15661 if (!Vector) {
15662 Vector = N0.getNode();
15663 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
15664 // BUILD_VECTOR.
15665 if (VT.getVectorElementType() !=
15666 N0.getValueType().getVectorElementType())
15667 break;
15668 } else if (Vector != N0.getNode()) {
15669 Odd = false;
15670 Even = false;
15671 break;
15672 }
15673
15674 // Extracted values are either at Even indices <0,2,4,...> or at Odd
15675 // indices <1,3,5,...>.
15676 uint64_t Val = N->getConstantOperandVal(1);
15677 if (Val == 2 * i) {
15678 Even = true;
15679 continue;
15680 }
15681 if (Val - 1 == 2 * i) {
15682 Odd = true;
15683 continue;
15684 }
15685
15686 // Something does not match: abort.
15687 Odd = false;
15688 Even = false;
15689 break;
15690 }
15691 if (Even || Odd) {
15692      SDValue LHS =
15693          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15694                      DAG.getConstant(0, DL, MVT::i64));
15695      SDValue RHS =
15696          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15697                      DAG.getConstant(NumElts, DL, MVT::i64));
15698
15699 if (Even && !Odd)
15700 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
15701 if (Odd && !Even)
15702 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
15703 }
15704 }
15705
15706  // Use DUP for non-constant splats. For floating-point constant splats,
15707  // reduce to the equivalent integer type and try again.
15708 if (usesOnlyOneValue) {
15709 if (!isConstant) {
15710 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15711 Value.getValueType() != VT) {
15712 LLVM_DEBUG(
15713 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15714 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
15715 }
15716
15717      // This is actually a DUPLANExx operation, which keeps everything in vector registers.
15718
15719 SDValue Lane = Value.getOperand(1);
15720 Value = Value.getOperand(0);
15721 if (Value.getValueSizeInBits() == 64) {
15722 LLVM_DEBUG(
15723 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15724 "widening it\n");
15725 Value = WidenVector(Value, DAG);
15726 }
15727
15728 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
15729 return DAG.getNode(Opcode, DL, VT, Value, Lane);
15730 }
15731
15732    if (VT.getVectorElementType().isFloatingPoint()) {
15733      SmallVector<SDValue, 8> Ops;
15734      EVT EltTy = VT.getVectorElementType();
15735 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15736 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15737 LLVM_DEBUG(
15738 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15739 "BITCASTS, and try again\n");
15740 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
15741 for (unsigned i = 0; i < NumElts; ++i)
15742 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
15743 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
15744 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
15745 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15746 Val.dump(););
15747 Val = LowerBUILD_VECTOR(Val, DAG);
15748 if (Val.getNode())
15749 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
15750 }
15751 }
15752
15753 // If we need to insert a small number of different non-constant elements and
15754 // the vector width is sufficiently large, prefer using DUP with the common
15755 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15756 // skip the constant lane handling below.
15757 bool PreferDUPAndInsert =
15758 !isConstant && NumDifferentLanes >= 1 &&
15759 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15760 NumDifferentLanes >= NumConstantLanes;
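  // For example (illustrative): a v8i8 BUILD_VECTOR with seven lanes equal to
  // a non-constant value X and one differing lane Y has NumDifferentLanes == 1,
  // so the heuristic above prefers a DUP of X followed by a single
  // INSERT_VECTOR_ELT of Y over eight separate lane initializations.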
15761
15762  // If only one constant value was used, and it was used for more than one
15763  // lane, start by splatting that value, then replace the non-constant lanes. This
15764 // is better than the default, which will perform a separate initialization
15765 // for each lane.
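  // For example (illustrative): <4 x i32> <7, 7, X, 7> with a non-constant X
  // is first materialized as a splat of 7 (e.g. via a MOVI/DUP) and then
  // patched with a single INSERT_VECTOR_ELT for the X lane.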
15766 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15767 // Firstly, try to materialize the splat constant.
15768 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
15769 unsigned BitSize = VT.getScalarSizeInBits();
15770 APInt ConstantValueAPInt(1, 0);
15771 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
15772 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
15773 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
15774 !ConstantValueAPInt.isAllOnes()) {
15775 Val = ConstantBuildVector(Val, DAG, Subtarget);
15776 if (!Val)
15777 // Otherwise, materialize the constant and splat it.
15778 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
15779 }
15780
15781 // Now insert the non-constant lanes.
15782 for (unsigned i = 0; i < NumElts; ++i) {
15783 SDValue V = Op.getOperand(i);
15784 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15785 if (!isIntOrFPConstant(V) && !V.isUndef())
15786 // Note that type legalization likely mucked about with the VT of the
15787 // source operand, so we may have to convert it here before inserting.
15788 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
15789 }
15790 return Val;
15791 }
15792
15793 // This will generate a load from the constant pool.
15794 if (isConstant) {
15795 LLVM_DEBUG(
15796 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15797 "expansion\n");
15798 return SDValue();
15799 }
15800
15801 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15802 // v4i32s. This is really a truncate, which we can construct out of (legal)
15803 // concats and truncate nodes.
15804  if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15805    return M;
15806
15807 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15808 if (NumElts >= 4) {
15809 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15810 return Shuffle;
15811
15812 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15813 return Shuffle;
15814 }
15815
15816 if (PreferDUPAndInsert) {
15817 // First, build a constant vector with the common element.
15818    SmallVector<SDValue, 8> Ops(NumElts, Value);
15819    SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
15820 // Next, insert the elements that do not match the common value.
15821 for (unsigned I = 0; I < NumElts; ++I)
15822 if (Op.getOperand(I) != Value)
15823 NewVector =
15824 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
15825 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
15826
15827 return NewVector;
15828 }
15829
15830 // If vector consists of two different values, try to generate two DUPs and
15831 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15832  if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15833    SmallVector<SDValue, 2> Vals;
15834 // Check the consecutive count of the value is the half number of vector
15835 // elements. In this case, we can use CONCAT_VECTORS. For example,
15836 //
15837 // canUseVECTOR_CONCAT = true;
15838 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15839 // t24, t24, t24, t24, t24, t24, t24, t24
15840 //
15841 // canUseVECTOR_CONCAT = false;
15842 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15843 // t24, t24, t24, t24, t24, t24, t24, t24
15844 bool canUseVECTOR_CONCAT = true;
15845 for (auto Pair : DifferentValueMap) {
15846 // Check different values have same length which is NumElts / 2.
15847 if (Pair.second != NumElts / 2)
15848 canUseVECTOR_CONCAT = false;
15849 Vals.push_back(Pair.first);
15850 }
15851
15852 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15853 // CONCAT_VECTORs. For example,
15854 //
15855 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15856 // t24, t24, t24, t24, t24, t24, t24, t24
15857 // ==>
15858 // t26: v8i8 = AArch64ISD::DUP t23
15859 // t28: v8i8 = AArch64ISD::DUP t24
15860 // t29: v16i8 = concat_vectors t26, t28
15861 if (canUseVECTOR_CONCAT) {
15862 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15863 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15864 SubVT.getVectorNumElements() >= 2) {
15865 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15866 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15867 SDValue DUP1 =
15868 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
15869 SDValue DUP2 =
15870 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
15871        SDValue CONCAT_VECTORS =
15872            DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
15873 return CONCAT_VECTORS;
15874 }
15875 }
15876
15877 // Let's try to generate VECTOR_SHUFFLE. For example,
15878 //
15879 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15880 // ==>
15881 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15882 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15883 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15884 if (NumElts >= 8) {
15885 SmallVector<int, 16> MaskVec;
15886      // Build mask for VECTOR_SHUFFLE.
15887 SDValue FirstLaneVal = Op.getOperand(0);
15888 for (unsigned i = 0; i < NumElts; ++i) {
15889 SDValue Val = Op.getOperand(i);
15890 if (FirstLaneVal == Val)
15891 MaskVec.push_back(i);
15892 else
15893 MaskVec.push_back(i + NumElts);
15894 }
15895
15896 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15897 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15898 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
15899 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
15900      SDValue VECTOR_SHUFFLE =
15901          DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
15902 return VECTOR_SHUFFLE;
15903 }
15904 }
15905
15906 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15907 // know the default expansion would otherwise fall back on something even
15908 // worse. For a vector with one or two non-undef values, that's
15909 // scalar_to_vector for the elements followed by a shuffle (provided the
15910 // shuffle is valid for the target) and materialization element by element
15911 // on the stack followed by a load for everything else.
15912 if (!isConstant && !usesOnlyOneValue) {
15913 LLVM_DEBUG(
15914 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15915 "of INSERT_VECTOR_ELT\n");
15916
15917 SDValue Vec = DAG.getUNDEF(VT);
15918 SDValue Op0 = Op.getOperand(0);
15919 unsigned i = 0;
15920
15921 // Use SCALAR_TO_VECTOR for lane zero to
15922 // a) Avoid a RMW dependency on the full vector register, and
15923 // b) Allow the register coalescer to fold away the copy if the
15924 // value is already in an S or D register, and we're forced to emit an
15925 // INSERT_SUBREG that we can't fold anywhere.
15926 //
15927 // We also allow types like i8 and i16 which are illegal scalar but legal
15928 // vector element types. After type-legalization the inserted value is
15929 // extended (i32) and it is safe to cast them to the vector type by ignoring
15930 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15931 if (!Op0.isUndef()) {
15932 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15933 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
15934 ++i;
15935 }
15936 LLVM_DEBUG({
15937 if (i < NumElts)
15938 dbgs() << "Creating nodes for the other vector elements:\n";
15939 });
15940 for (; i < NumElts; ++i) {
15941 SDValue V = Op.getOperand(i);
15942 if (V.isUndef())
15943 continue;
15944 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15945 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
15946 }
15947 return Vec;
15948 }
15949
15950 LLVM_DEBUG(
15951 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15952 "better alternative\n");
15953 return SDValue();
15954}
15955
15956SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15957 SelectionDAG &DAG) const {
15958 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15959 !Subtarget->isNeonAvailable()))
15960 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15961
15962 assert(Op.getValueType().isScalableVector() &&
15963 isTypeLegal(Op.getValueType()) &&
15964 "Expected legal scalable vector type!");
15965
15966 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15967 unsigned NumOperands = Op->getNumOperands();
15968 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15969 "Unexpected number of operands in CONCAT_VECTORS");
15970
15971 if (NumOperands == 2)
15972 return Op;
15973
15974 // Concat each pair of subvectors and pack into the lower half of the array.
15975 SmallVector<SDValue> ConcatOps(Op->ops());
15976 while (ConcatOps.size() > 1) {
15977 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15978 SDValue V1 = ConcatOps[I];
15979 SDValue V2 = ConcatOps[I + 1];
15980 EVT SubVT = V1.getValueType();
15981 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15982 ConcatOps[I / 2] =
15983 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15984 }
15985 ConcatOps.resize(ConcatOps.size() / 2);
15986 }
15987 return ConcatOps[0];
15988 }
15989
15990 return SDValue();
15991}
15992
15993SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15994 SelectionDAG &DAG) const {
15995 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15996
15997 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15998 !Subtarget->isNeonAvailable()))
15999 return LowerFixedLengthInsertVectorElt(Op, DAG);
16000
16001 EVT VT = Op.getOperand(0).getValueType();
16002
16003 if (VT.getScalarType() == MVT::i1) {
16004 EVT VectorVT = getPromotedVTForPredicate(VT);
16005 SDLoc DL(Op);
16006 SDValue ExtendedVector =
16007 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
16008 SDValue ExtendedValue =
16009 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
16010 VectorVT.getScalarType().getSizeInBits() < 32
16011 ? MVT::i32
16012 : VectorVT.getScalarType());
16013 ExtendedVector =
16014 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
16015 ExtendedValue, Op.getOperand(2));
16016 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
16017 }
16018
16019 // Check for non-constant or out of range lane.
16020 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
16021 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16022 return SDValue();
16023
16024 return Op;
16025}
16026
16027SDValue
16028AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
16029 SelectionDAG &DAG) const {
16030 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
16031 EVT VT = Op.getOperand(0).getValueType();
16032
16033 if (VT.getScalarType() == MVT::i1) {
16034 // We can't directly extract from an SVE predicate; extend it first.
16035 // (This isn't the only possible lowering, but it's straightforward.)
16036 EVT VectorVT = getPromotedVTForPredicate(VT);
16037 SDLoc DL(Op);
16038 SDValue Extend =
16039 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
16040 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
16041 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
16042 Extend, Op.getOperand(1));
16043 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
16044 }
16045
16046 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16047 return LowerFixedLengthExtractVectorElt(Op, DAG);
16048
16049 // Check for non-constant or out of range lane.
16050 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16051 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16052 return SDValue();
16053
16054 // Insertion/extraction are legal for V128 types.
16055 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
16056 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
16057 VT == MVT::v8f16 || VT == MVT::v8bf16)
16058 return Op;
16059
16060 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
16061 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
16062 VT != MVT::v4bf16)
16063 return SDValue();
16064
16065 // For V64 types, we perform extraction by expanding the value
16066 // to a V128 type and perform the extraction on that.
16067 SDLoc DL(Op);
16068 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
16069 EVT WideTy = WideVec.getValueType();
16070
16071 EVT ExtrTy = WideTy.getVectorElementType();
16072 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
16073 ExtrTy = MVT::i32;
16074
16075 // For extractions, we just return the result directly.
16076 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
16077 Op.getOperand(1));
16078}
16079
16080SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16081 SelectionDAG &DAG) const {
16082 EVT VT = Op.getValueType();
16083  assert(VT.isFixedLengthVector() &&
16084         "Only cases that extract a fixed length vector are supported!");
16085 EVT InVT = Op.getOperand(0).getValueType();
16086
16087 // If we don't have legal types yet, do nothing
16088 if (!isTypeLegal(InVT))
16089 return SDValue();
16090
16091 if (InVT.is128BitVector()) {
16092 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16093 unsigned Idx = Op.getConstantOperandVal(1);
16094
16095 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16096 if (Idx == 0)
16097 return Op;
16098
16099 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16100 // that directly.
16101 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16102 return Op;
16103 }
16104
16105 if (InVT.isScalableVector() ||
16106 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16107 SDLoc DL(Op);
16108 SDValue Vec = Op.getOperand(0);
16109 SDValue Idx = Op.getOperand(1);
16110
16111 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16112 if (PackedVT != InVT) {
16113 // Pack input into the bottom part of an SVE register and try again.
16114 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16115 DAG.getUNDEF(PackedVT), Vec,
16116 DAG.getVectorIdxConstant(0, DL));
16117 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16118 }
16119
16120 // This will get matched by custom code during ISelDAGToDAG.
16121 if (isNullConstant(Idx))
16122 return Op;
16123
16124 assert(InVT.isScalableVector() && "Unexpected vector type!");
16125 // Move requested subvector to the start of the vector and try again.
16126 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
16127 return convertFromScalableVector(DAG, VT, Splice);
16128 }
16129
16130 return SDValue();
16131}
16132
16133SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16134 SelectionDAG &DAG) const {
16135 assert(Op.getValueType().isScalableVector() &&
16136 "Only expect to lower inserts into scalable vectors!");
16137
16138 EVT InVT = Op.getOperand(1).getValueType();
16139 unsigned Idx = Op.getConstantOperandVal(2);
16140
16141 SDValue Vec0 = Op.getOperand(0);
16142 SDValue Vec1 = Op.getOperand(1);
16143 SDLoc DL(Op);
16144 EVT VT = Op.getValueType();
16145
16146 if (InVT.isScalableVector()) {
16147 if (!isTypeLegal(VT))
16148 return SDValue();
16149
16150 // Break down insert_subvector into simpler parts.
16151 if (VT.getVectorElementType() == MVT::i1) {
16152 unsigned NumElts = VT.getVectorMinNumElements();
16153 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16154
16155 SDValue Lo, Hi;
16156 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16157 DAG.getVectorIdxConstant(0, DL));
16158 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16159 DAG.getVectorIdxConstant(NumElts / 2, DL));
16160 if (Idx < (NumElts / 2))
16161 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16162 DAG.getVectorIdxConstant(Idx, DL));
16163 else
16164 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16165 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16166
16167 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16168 }
16169
16170 // We can select these directly.
16171 if (isTypeLegal(InVT) && Vec0.isUndef())
16172 return Op;
16173
16174 // Ensure the subvector is half the size of the main vector.
16175 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16176 return SDValue();
16177
16178 // Here narrow and wide refers to the vector element types. After "casting"
16179 // both vectors must have the same bit length and so because the subvector
16180 // has fewer elements, those elements need to be bigger.
16181 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16182 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16183
16184 // NOP cast operands to the largest legal vector of the same element count.
16185 if (VT.isFloatingPoint()) {
16186 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16187 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16188 } else {
16189 // Legal integer vectors are already their largest so Vec0 is fine as is.
16190 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16191 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16192 }
16193
16194 // To replace the top/bottom half of vector V with vector SubV we widen the
16195 // preserved half of V, concatenate this to SubV (the order depending on the
16196 // half being replaced) and then narrow the result.
16197 SDValue Narrow;
16198 if (Idx == 0) {
16199 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16200 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16201 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16202 } else {
16203 assert(Idx == InVT.getVectorMinNumElements() &&
16204 "Invalid subvector index!");
16205 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16206 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16207 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16208 }
16209
16210 return getSVESafeBitCast(VT, Narrow, DAG);
16211 }
16212
16213 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16214 // This will be matched by custom code during ISelDAGToDAG.
16215 if (Vec0.isUndef())
16216 return Op;
16217
16218 std::optional<unsigned> PredPattern =
16219        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
16220    auto PredTy = VT.changeVectorElementType(MVT::i1);
16221 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16222 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16223 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16224 }
16225
16226 return SDValue();
16227}
16228
16229static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16230 if (Op.getOpcode() != AArch64ISD::DUP &&
16231 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16232 Op.getOpcode() != ISD::BUILD_VECTOR)
16233 return false;
16234
16235 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16236 !isAllConstantBuildVector(Op, SplatVal))
16237 return false;
16238
16239 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16240 !isa<ConstantSDNode>(Op->getOperand(0)))
16241 return false;
16242
16243 SplatVal = Op->getConstantOperandVal(0);
16244 if (Op.getValueType().getVectorElementType() != MVT::i64)
16245 SplatVal = (int32_t)SplatVal;
16246
16247 Negated = false;
16248 if (isPowerOf2_64(SplatVal))
16249 return true;
16250
16251 Negated = true;
16252 if (isPowerOf2_64(-SplatVal)) {
16253 SplatVal = -SplatVal;
16254 return true;
16255 }
16256
16257 return false;
16258}
16259
16260SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16261 EVT VT = Op.getValueType();
16262 SDLoc DL(Op);
16263
16264 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16265 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16266
16267 assert(VT.isScalableVector() && "Expected a scalable vector.");
16268
16269 bool Signed = Op.getOpcode() == ISD::SDIV;
16270 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16271
16272 bool Negated;
16273 uint64_t SplatVal;
16274 // NOTE: SRAD cannot be used to represent sdiv-by-one.
16275 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
16276 SplatVal > 1) {
16277    SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
16278    SDValue Res =
16279 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16280 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16281 if (Negated)
16282 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16283
16284 return Res;
16285 }
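  // For example (illustrative): an sdiv of an nxv4i32 value by a splat of 8
  // becomes a single predicated ASRD by 3 (log2(8)); a divisor splat of -8
  // uses the same ASRD followed by a subtraction from zero to negate the
  // result.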
16286
16287 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16288 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16289
16290 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16291 // operations, and truncate the result.
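  // For example (illustrative): an nxv16i8 division is unpacked with
  // [S,U]UNPKLO/[S,U]UNPKHI into two nxv8i16 halves (each of which is widened
  // again to nxv4i32 the same way), the halves are divided separately, and the
  // narrowed results are packed back together with UZP1.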
16292 EVT WidenedVT;
16293 if (VT == MVT::nxv16i8)
16294 WidenedVT = MVT::nxv8i16;
16295 else if (VT == MVT::nxv8i16)
16296 WidenedVT = MVT::nxv4i32;
16297 else
16298 llvm_unreachable("Unexpected Custom DIV operation");
16299
16300 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16301 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16302 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16303 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16304 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16305 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16306 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16307 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16308 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16309 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16310 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16311}
16312
16313bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16314 EVT VT, unsigned DefinedValues) const {
16315 if (!Subtarget->isNeonAvailable())
16316 return false;
16317  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
16318}
16319
16320bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
16321  // Currently no fixed length shuffles that require SVE are legal.
16322 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16323 return false;
16324
16325 if (VT.getVectorNumElements() == 4 &&
16326 (VT.is128BitVector() || VT.is64BitVector())) {
16327 unsigned Cost = getPerfectShuffleCost(M);
16328 if (Cost <= 1)
16329 return true;
16330 }
16331
16332 bool DummyBool;
16333 int DummyInt;
16334 unsigned DummyUnsigned;
16335
16336 unsigned EltSize = VT.getScalarSizeInBits();
16337 unsigned NumElts = VT.getVectorNumElements();
16338  return (ShuffleVectorSDNode::isSplatMask(M) ||
16339          isREVMask(M, EltSize, NumElts, 64) ||
16340 isREVMask(M, EltSize, NumElts, 32) ||
16341 isREVMask(M, EltSize, NumElts, 16) ||
16342 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16343 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16344 isTRNMask(M, NumElts, DummyUnsigned) ||
16345 isUZPMask(M, NumElts, DummyUnsigned) ||
16346 isZIPMask(M, NumElts, DummyUnsigned) ||
16347 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16348 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16349 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16350 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16351 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16352}
16353
16354bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
16355                                                  EVT VT) const {
16356 // Just delegate to the generic legality, clear masks aren't special.
16357 return isShuffleMaskLegal(M, VT);
16358}
16359
16360/// getVShiftImm - Check if this is a valid build_vector for the immediate
16361/// operand of a vector shift operation, where all the elements of the
16362/// build_vector must have the same constant integer value.
16363static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16364 // Ignore bit_converts.
16365 while (Op.getOpcode() == ISD::BITCAST)
16366 Op = Op.getOperand(0);
16367  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
16368  APInt SplatBits, SplatUndef;
16369 unsigned SplatBitSize;
16370 bool HasAnyUndefs;
16371 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16372 HasAnyUndefs, ElementBits) ||
16373 SplatBitSize > ElementBits)
16374 return false;
16375 Cnt = SplatBits.getSExtValue();
16376 return true;
16377}
16378
16379/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16380/// operand of a vector shift left operation. That value must be in the range:
16381/// 0 <= Value < ElementBits for a left shift; or
16382/// 0 <= Value <= ElementBits for a long left shift.
16383static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16384 assert(VT.isVector() && "vector shift count is not a vector type");
16385 int64_t ElementBits = VT.getScalarSizeInBits();
16386 if (!getVShiftImm(Op, ElementBits, Cnt))
16387 return false;
16388 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16389}
16390
16391/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16392/// operand of a vector shift right operation. The value must be in the range:
16393///   1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrowing right shift.
16394static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16395 assert(VT.isVector() && "vector shift count is not a vector type");
16396 int64_t ElementBits = VT.getScalarSizeInBits();
16397 if (!getVShiftImm(Op, ElementBits, Cnt))
16398 return false;
16399 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16400}
16401
16402SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16403 SelectionDAG &DAG) const {
16404 EVT VT = Op.getValueType();
16405
16406 if (VT.getScalarType() == MVT::i1) {
16407 // Lower i1 truncate to `(x & 1) != 0`.
16408 SDLoc DL(Op);
16409 EVT OpVT = Op.getOperand(0).getValueType();
16410 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16411 SDValue One = DAG.getConstant(1, DL, OpVT);
16412 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16413 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16414 }
16415
16416 if (!VT.isVector() || VT.isScalableVector())
16417 return SDValue();
16418
16419 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16420 !Subtarget->isNeonAvailable()))
16421 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16422
16423 return SDValue();
16424}
16425
16426// Check if we can lower this SRL to a rounding shift instruction. ResVT is
16427// possibly a truncated type; it tells how many bits of the value are to be
16428// used.
16429static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
16430 SelectionDAG &DAG,
16431 unsigned &ShiftValue,
16432 SDValue &RShOperand) {
16433 if (Shift->getOpcode() != ISD::SRL)
16434 return false;
16435
16436 EVT VT = Shift.getValueType();
16437 assert(VT.isScalableVT());
16438
16439  auto ShiftOp1 =
16440      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
16441 if (!ShiftOp1)
16442 return false;
16443
16444 ShiftValue = ShiftOp1->getZExtValue();
16445 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16446 return false;
16447
16448 SDValue Add = Shift->getOperand(0);
16449 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16450 return false;
16451
16452  assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
16453         "ResVT must be truncated or same type as the shift.");
16454 // Check if an overflow can lead to incorrect results.
16455 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16456 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16457 return false;
16458
16459  auto AddOp1 =
16460      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
16461 if (!AddOp1)
16462 return false;
16463 uint64_t AddValue = AddOp1->getZExtValue();
16464 if (AddValue != 1ULL << (ShiftValue - 1))
16465 return false;
16466
16467 RShOperand = Add->getOperand(0);
16468 return true;
16469}
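// For example (illustrative): if VT is nxv8i16 but only the low 8 bits of each
// lane are used (ResVT has i8 elements), then (srl (add X, splat(8)), splat(4))
// passes the checks in canLowerSRLToRoundingShiftForVT, since 8 == 1 << (4 - 1)
// and the shift amount 4 does not exceed the 8 discarded bits, so the caller
// can select a rounding shift right by 4 instead of the add+shift pair.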
16470
16471SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16472 SelectionDAG &DAG) const {
16473 EVT VT = Op.getValueType();
16474 SDLoc DL(Op);
16475 int64_t Cnt;
16476
16477 if (!Op.getOperand(1).getValueType().isVector())
16478 return Op;
16479 unsigned EltSize = VT.getScalarSizeInBits();
16480
16481 switch (Op.getOpcode()) {
16482 case ISD::SHL:
16483 if (VT.isScalableVector() ||
16484 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16485 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16486
16487 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16488 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16489 DAG.getTargetConstant(Cnt, DL, MVT::i32));
16490    return DAG.getNode(
16491        ISD::INTRINSIC_WO_CHAIN, DL, VT,
16492 DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
16493 Op.getOperand(0), Op.getOperand(1));
16494 case ISD::SRA:
16495 case ISD::SRL:
16496 if (VT.isScalableVector() &&
16497 (Subtarget->hasSVE2() ||
16498 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16499 SDValue RShOperand;
16500 unsigned ShiftValue;
16501 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16502 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16503 getPredicateForVector(DAG, DL, VT), RShOperand,
16504 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16505 }
16506
16507 if (VT.isScalableVector() ||
16508 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16509 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16510 : AArch64ISD::SRL_PRED;
16511 return LowerToPredicatedOp(Op, DAG, Opc);
16512 }
16513
16514 // Right shift immediate
16515 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16516 unsigned Opc =
16517 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16518 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16519 DAG.getTargetConstant(Cnt, DL, MVT::i32),
16520 Op->getFlags());
16521 }
16522
16523    // Right shift register. Note that there is no shift-right-register
16524    // instruction, but the shift-left-register instruction takes a signed
16525    // value, where negative numbers specify a right shift.
16526 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16527 : Intrinsic::aarch64_neon_ushl;
16528 // negate the shift amount
16529 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16530 Op.getOperand(1));
16531    SDValue NegShiftLeft =
16532        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16533 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16534 NegShift);
16535 return NegShiftLeft;
16536 }
16537
16538 llvm_unreachable("unexpected shift opcode");
16539}
16540
16541SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16542 SelectionDAG &DAG) const {
16543 if (Op.getValueType().isScalableVector())
16544 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16545
16546 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16547 !Subtarget->isNeonAvailable()))
16548 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16549
16550 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16551 SDValue LHS = Op.getOperand(0);
16552 SDValue RHS = Op.getOperand(1);
16553 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16554 SDLoc DL(Op);
16555
16556 if (LHS.getValueType().getVectorElementType().isInteger())
16557 return Op;
16558
16559 assert(((!Subtarget->hasFullFP16() &&
16560 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16561 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16562 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16563 "Unexpected type!");
16564
16565 // Lower isnan(x) | isnan(never-nan) to x != x.
16566 // Lower !isnan(x) & !isnan(never-nan) to x == x.
16567 if (CC == ISD::SETUO || CC == ISD::SETO) {
16568 bool OneNaN = false;
16569 if (LHS == RHS) {
16570 OneNaN = true;
16571 } else if (DAG.isKnownNeverNaN(RHS)) {
16572 OneNaN = true;
16573 RHS = LHS;
16574 } else if (DAG.isKnownNeverNaN(LHS)) {
16575 OneNaN = true;
16576 LHS = RHS;
16577 }
16578 if (OneNaN) {
16579 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16580 }
16581 }
16582
16583 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16584 // clean. Some of them require two branches to implement.
16585 AArch64CC::CondCode CC1, CC2;
16586 bool ShouldInvert;
16587 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
16588
16589 bool NoNaNs =
16590 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
16591 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
16592 if (!Cmp.getNode())
16593 return SDValue();
16594
16595 if (CC2 != AArch64CC::AL) {
16596 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
16597 if (!Cmp2.getNode())
16598 return SDValue();
16599
16600 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
16601 }
16602
16603 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
16604
16605 if (ShouldInvert)
16606 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
16607
16608 return Cmp;
16609}
16610
16611static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
16612 SelectionDAG &DAG) {
16613 SDValue VecOp = ScalarOp.getOperand(0);
16614 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
16615 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
16616 DAG.getConstant(0, DL, MVT::i64));
16617}
16618
16619static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
16620 SDLoc DL, SelectionDAG &DAG) {
16621 unsigned ScalarOpcode;
16622 switch (Opcode) {
16623 case ISD::VECREDUCE_AND:
16624 ScalarOpcode = ISD::AND;
16625 break;
16626 case ISD::VECREDUCE_OR:
16627 ScalarOpcode = ISD::OR;
16628 break;
16629 case ISD::VECREDUCE_XOR:
16630 ScalarOpcode = ISD::XOR;
16631 break;
16632 default:
16633 llvm_unreachable("Expected bitwise vector reduction");
16634 return SDValue();
16635 }
16636
16637 EVT VecVT = Vec.getValueType();
16638 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16639 "Expected power-of-2 length vector");
16640
16641 EVT ElemVT = VecVT.getVectorElementType();
16642
16643 SDValue Result;
16644 unsigned NumElems = VecVT.getVectorNumElements();
16645
16646 // Special case for boolean reductions
16647 if (ElemVT == MVT::i1) {
16648 // Split large vectors into smaller ones
16649 if (NumElems > 16) {
16650 SDValue Lo, Hi;
16651 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16652 EVT HalfVT = Lo.getValueType();
16653 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
16654 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
16655 }
16656
16657 // Results of setcc operations get widened to 128 bits if their input
16658 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
16659 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
16660 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
16661 // size leads to the best codegen, since e.g. setcc results might need to be
16662 // truncated otherwise.
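    // For example (illustrative): a v4i1 AND reduction of a setcc on v4i32
    // inputs is sign extended to v4i32 here and reduced with an unsigned-min
    // reduction (UMINV); the result is all-ones only if every lane was true,
    // and is then truncated back to i1.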
16663 unsigned ExtendedWidth = 64;
16664 if (Vec.getOpcode() == ISD::SETCC &&
16665 Vec.getOperand(0).getValueSizeInBits() >= 128) {
16666 ExtendedWidth = 128;
16667 }
16668 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16669
16670 // any_ext doesn't work with umin/umax, so only use it for uadd.
16671 unsigned ExtendOp =
16672 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16673 SDValue Extended = DAG.getNode(
16674 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16675 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16676 // in that case we bitcast the sign extended values from v2i64 to v4i32
16677 // before reduction for optimal code generation.
16678 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16679 NumElems == 2 && ExtendedWidth == 128) {
16680 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16681 ExtendedVT = MVT::i32;
16682 }
16683 switch (ScalarOpcode) {
16684 case ISD::AND:
16685 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16686 break;
16687 case ISD::OR:
16688 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16689 break;
16690 case ISD::XOR:
16691 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16692 break;
16693 default:
16694 llvm_unreachable("Unexpected Opcode");
16695 }
16696
16697 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16698 } else {
16699 // Iteratively split the vector in half and combine using the bitwise
16700 // operation until it fits in a 64 bit register.
16701 while (VecVT.getSizeInBits() > 64) {
16702 SDValue Lo, Hi;
16703 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16704 VecVT = Lo.getValueType();
16705 NumElems = VecVT.getVectorNumElements();
16706 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16707 }
16708
16709 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16710
16711 // Do the remaining work on a scalar since it allows the code generator to
16712 // combine the shift and bitwise operation into one instruction and since
16713 // integer instructions can have higher throughput than vector instructions.
16714 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16715
16716 // Iteratively combine the lower and upper halves of the scalar using the
16717 // bitwise operation, halving the relevant region of the scalar in each
16718 // iteration, until the relevant region is just one element of the original
16719 // vector.
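    // For example (illustrative): a v8i8 XOR reduction bitcasts the vector to
    // an i64 scalar X and folds it as
    //   X ^= X >> 32; X ^= X >> 16; X ^= X >> 8;
    // after which the low byte of X holds the reduction result.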
16720 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16721 SDValue ShiftAmount =
16722 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16723 SDValue Shifted =
16724 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16725 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16726 }
16727
16728 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16729 }
16730
16731 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16732}
16733
16734SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16735 SelectionDAG &DAG) const {
16736 SDValue Src = Op.getOperand(0);
16737 EVT SrcVT = Src.getValueType();
16738
16739 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
16740 // widening by inserting zeroes.
16741 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
16742 SrcVT == MVT::v2f16) {
16743 SDLoc DL(Op);
16744 return DAG.getNode(ISD::FADD, DL, MVT::f16,
16745 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
16746 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
16747 }
16748
16749 // Try to lower fixed length reductions to SVE.
16750 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16751 Op.getOpcode() == ISD::VECREDUCE_AND ||
16752 Op.getOpcode() == ISD::VECREDUCE_OR ||
16753 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16754 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16755 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16756 SrcVT.getVectorElementType() == MVT::i64);
16757  if (SrcVT.isScalableVector() ||
16758      useSVEForFixedLengthVectorVT(
16759 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16760
16761 if (SrcVT.getVectorElementType() == MVT::i1)
16762 return LowerPredReductionToSVE(Op, DAG);
16763
16764 switch (Op.getOpcode()) {
16765 case ISD::VECREDUCE_ADD:
16766 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16767 case ISD::VECREDUCE_AND:
16768 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16769 case ISD::VECREDUCE_OR:
16770 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16771 case ISD::VECREDUCE_SMAX:
16772 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16773 case ISD::VECREDUCE_SMIN:
16774 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16775 case ISD::VECREDUCE_UMAX:
16776 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16777 case ISD::VECREDUCE_UMIN:
16778 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16779 case ISD::VECREDUCE_XOR:
16780 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16781 case ISD::VECREDUCE_FADD:
16782 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16783 case ISD::VECREDUCE_FMAX:
16784 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16785 case ISD::VECREDUCE_FMIN:
16786 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16787 case ISD::VECREDUCE_FMAXIMUM:
16788 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16789 case ISD::VECREDUCE_FMINIMUM:
16790 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16791 default:
16792 llvm_unreachable("Unhandled fixed length reduction");
16793 }
16794 }
16795
16796 // Lower NEON reductions.
16797 SDLoc DL(Op);
16798 switch (Op.getOpcode()) {
16799 case ISD::VECREDUCE_AND:
16800 case ISD::VECREDUCE_OR:
16801 case ISD::VECREDUCE_XOR:
16802 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16803 Op.getValueType(), DL, DAG);
16804 case ISD::VECREDUCE_ADD:
16805 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
16806 case ISD::VECREDUCE_SMAX:
16807 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
16808 case ISD::VECREDUCE_SMIN:
16809 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
16810 case ISD::VECREDUCE_UMAX:
16811 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
16812 case ISD::VECREDUCE_UMIN:
16813 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
16814 default:
16815 llvm_unreachable("Unhandled reduction");
16816 }
16817}
16818
16819SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
16820 SelectionDAG &DAG) const {
16821 SDLoc DL(Op);
16822 SDValue Src = Op.getOperand(0);
16823 EVT SrcVT = Src.getValueType();
16824 assert(SrcVT.isScalableVector() && "Unexpected operand type!");
16825
16826 SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
16827 unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
16828 SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
16829
16830  // Whilst we don't know the size of the vector we do know the maximum size,
16831  // so we can perform a tree reduction with an identity vector, which means
16832  // that once we arrive at the result the remaining stages (when the vector
16833  // is smaller than the maximum) have no effect.
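  // For example (illustrative): for an nxv4i32 multiply reduction, each stage
  // deinterleaves the vector into its even and odd lanes (padded with the
  // identity element, 1 for integer multiplies) and multiplies the two halves,
  // so after log2(maximum element count) stages lane 0 holds the product of
  // every element regardless of the runtime vector length.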
16834
16835  unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
16836  unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
16837
16838 for (unsigned I = 0; I < Stages; ++I) {
16839 Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
16840 Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
16841 }
16842
16843 return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
16844}
16845
16846SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16847 SelectionDAG &DAG) const {
16848 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16849 // No point replacing if we don't have the relevant instruction/libcall anyway
16850 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16851 return SDValue();
16852
16853 // LSE has an atomic load-clear instruction, but not a load-and.
16854 SDLoc DL(Op);
16855 MVT VT = Op.getSimpleValueType();
16856 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16857 SDValue RHS = Op.getOperand(2);
16858 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16859 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
16860 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
16861 Op.getOperand(0), Op.getOperand(1), RHS,
16862 AN->getMemOperand());
16863}
16864
16865SDValue
16866AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16867 SelectionDAG &DAG) const {
16868
16869 SDLoc DL(Op);
16870 // Get the inputs.
16871 SDNode *Node = Op.getNode();
16872 SDValue Chain = Op.getOperand(0);
16873 SDValue Size = Op.getOperand(1);
16874 MaybeAlign Align =
16875 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16876 EVT VT = Node->getValueType(0);
16877
16878  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
16879          "no-stack-arg-probe")) {
16880 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16881 Chain = SP.getValue(1);
16882 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16883 if (Align)
16884 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16885 DAG.getSignedConstant(-Align->value(), DL, VT));
16886 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16887 SDValue Ops[2] = {SP, Chain};
16888 return DAG.getMergeValues(Ops, DL);
16889 }
16890
16891 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16892
16893 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16894 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16895 PtrVT, 0);
16896
16897 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16898 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16899 if (Subtarget->hasCustomCallingConv())
16900 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16901
16902 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
16903 DAG.getConstant(4, DL, MVT::i64));
16904 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
16905 Chain =
16906 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
16907 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16908 DAG.getRegisterMask(Mask), Chain.getValue(1));
16909 // To match the actual intent better, we should read the output from X15 here
16910 // again (instead of potentially spilling it to the stack), but rereading Size
16911 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16912 // here.
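  // For example (illustrative): the probe helper expects the allocation size
  // in 16-byte units in X15, which is why Size is shifted right by 4 before
  // the call and scaled back up by 4 afterwards; a 4096-byte allocation is
  // passed as 256.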
16913
16914 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
16915 DAG.getConstant(4, DL, MVT::i64));
16916
16917 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16918 Chain = SP.getValue(1);
16919 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16920 if (Align)
16921 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16922 DAG.getSignedConstant(-Align->value(), DL, VT));
16923 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16924
16925 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
16926
16927 SDValue Ops[2] = {SP, Chain};
16928 return DAG.getMergeValues(Ops, DL);
16929}
16930
16931SDValue
16932AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16933 SelectionDAG &DAG) const {
16934 // Get the inputs.
16935 SDNode *Node = Op.getNode();
16936 SDValue Chain = Op.getOperand(0);
16937 SDValue Size = Op.getOperand(1);
16938
16939 MaybeAlign Align =
16940 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16941 SDLoc DL(Op);
16942 EVT VT = Node->getValueType(0);
16943
16944 // Construct the new SP value in a GPR.
16945 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16946 Chain = SP.getValue(1);
16947 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16948 if (Align)
16949 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16950 DAG.getSignedConstant(-Align->value(), DL, VT));
16951
16952 // Set the real SP to the new value with a probing loop.
16953 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
16954 SDValue Ops[2] = {SP, Chain};
16955 return DAG.getMergeValues(Ops, DL);
16956}
16957
16958SDValue
16959AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16960 SelectionDAG &DAG) const {
16961 MachineFunction &MF = DAG.getMachineFunction();
16962
16963 if (Subtarget->isTargetWindows())
16964 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16965 else if (hasInlineStackProbe(MF))
16966 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16967 else
16968 return SDValue();
16969}
16970
16971SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16972 unsigned NewOp) const {
16973 if (Subtarget->hasSVE2())
16974 return LowerToPredicatedOp(Op, DAG, NewOp);
16975
16976 // Default to expand.
16977 return SDValue();
16978}
16979
16980SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16981 SelectionDAG &DAG) const {
16982 EVT VT = Op.getValueType();
16983 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16984
16985 SDLoc DL(Op);
16986 APInt MulImm = Op.getConstantOperandAPInt(0);
16987 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16988 VT);
16989}
16990
16991/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16992template <unsigned NumVecs>
16993static bool
16994setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16995              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16996  Info.opc = ISD::INTRINSIC_VOID;
16997  // Retrieve EC from first vector argument.
16998  const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16999  ElementCount EC = VT.getVectorElementCount();
17000#ifndef NDEBUG
17001 // Check the assumption that all input vectors are the same type.
17002 for (unsigned I = 0; I < NumVecs; ++I)
17003 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
17004 "Invalid type.");
17005#endif
17006 // memVT is `NumVecs * VT`.
17007  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
17008                                EC * NumVecs);
17009 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
17010 Info.offset = 0;
17011 Info.align.reset();
17012  Info.flags = MachineMemOperand::MOStore;
17013  return true;
17014}
17015
17016/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
17017/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
17018/// specified in the intrinsic calls.
17019bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
17020                                               const CallInst &I,
17021 MachineFunction &MF,
17022 unsigned Intrinsic) const {
17023 auto &DL = I.getDataLayout();
17024 switch (Intrinsic) {
17025 case Intrinsic::aarch64_sve_st2:
17026 return setInfoSVEStN<2>(*this, DL, Info, I);
17027 case Intrinsic::aarch64_sve_st3:
17028 return setInfoSVEStN<3>(*this, DL, Info, I);
17029 case Intrinsic::aarch64_sve_st4:
17030 return setInfoSVEStN<4>(*this, DL, Info, I);
17031 case Intrinsic::aarch64_neon_ld2:
17032 case Intrinsic::aarch64_neon_ld3:
17033 case Intrinsic::aarch64_neon_ld4:
17034 case Intrinsic::aarch64_neon_ld1x2:
17035 case Intrinsic::aarch64_neon_ld1x3:
17036 case Intrinsic::aarch64_neon_ld1x4: {
17037 Info.opc = ISD::INTRINSIC_W_CHAIN;
17038 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
17039 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17040 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17041 Info.offset = 0;
17042 Info.align.reset();
17043 // volatile loads with NEON intrinsics not supported
17044 Info.flags = MachineMemOperand::MOLoad;
17045 return true;
17046 }
17047 case Intrinsic::aarch64_neon_ld2lane:
17048 case Intrinsic::aarch64_neon_ld3lane:
17049 case Intrinsic::aarch64_neon_ld4lane:
17050 case Intrinsic::aarch64_neon_ld2r:
17051 case Intrinsic::aarch64_neon_ld3r:
17052 case Intrinsic::aarch64_neon_ld4r: {
17053 Info.opc = ISD::INTRINSIC_W_CHAIN;
17054    // The ldN intrinsics return a struct of vectors, all with the same type.
17055 Type *RetTy = I.getType();
17056 auto *StructTy = cast<StructType>(RetTy);
17057 unsigned NumElts = StructTy->getNumElements();
17058 Type *VecTy = StructTy->getElementType(0);
17059 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17060 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17061 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17062 Info.offset = 0;
17063 Info.align.reset();
17064 // volatile loads with NEON intrinsics not supported
17065 Info.flags = MachineMemOperand::MOLoad;
17066 return true;
17067 }
17068 case Intrinsic::aarch64_neon_st2:
17069 case Intrinsic::aarch64_neon_st3:
17070 case Intrinsic::aarch64_neon_st4:
17071 case Intrinsic::aarch64_neon_st1x2:
17072 case Intrinsic::aarch64_neon_st1x3:
17073 case Intrinsic::aarch64_neon_st1x4: {
17074 Info.opc = ISD::INTRINSIC_VOID;
17075 unsigned NumElts = 0;
17076 for (const Value *Arg : I.args()) {
17077 Type *ArgTy = Arg->getType();
17078 if (!ArgTy->isVectorTy())
17079 break;
17080 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
17081 }
17082 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17083 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17084 Info.offset = 0;
17085 Info.align.reset();
17086 // volatile stores with NEON intrinsics not supported
17087 Info.flags = MachineMemOperand::MOStore;
17088 return true;
17089 }
17090 case Intrinsic::aarch64_neon_st2lane:
17091 case Intrinsic::aarch64_neon_st3lane:
17092 case Intrinsic::aarch64_neon_st4lane: {
17093 Info.opc = ISD::INTRINSIC_VOID;
17094 unsigned NumElts = 0;
17095    // All vector arguments have the same type.
17096 Type *VecTy = I.getArgOperand(0)->getType();
17097 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17098
17099 for (const Value *Arg : I.args()) {
17100 Type *ArgTy = Arg->getType();
17101 if (!ArgTy->isVectorTy())
17102 break;
17103 NumElts += 1;
17104 }
17105
17106 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17107 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17108 Info.offset = 0;
17109 Info.align.reset();
17110 // volatile stores with NEON intrinsics not supported
17111 Info.flags = MachineMemOperand::MOStore;
17112 return true;
17113 }
17114 case Intrinsic::aarch64_ldaxr:
17115 case Intrinsic::aarch64_ldxr: {
17116 Type *ValTy = I.getParamElementType(0);
17117 Info.opc = ISD::INTRINSIC_W_CHAIN;
17118 Info.memVT = MVT::getVT(ValTy);
17119 Info.ptrVal = I.getArgOperand(0);
17120 Info.offset = 0;
17121 Info.align = DL.getABITypeAlign(ValTy);
17122    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17123    return true;
17124 }
17125 case Intrinsic::aarch64_stlxr:
17126 case Intrinsic::aarch64_stxr: {
17127 Type *ValTy = I.getParamElementType(1);
17128 Info.opc = ISD::INTRINSIC_W_CHAIN;
17129 Info.memVT = MVT::getVT(ValTy);
17130 Info.ptrVal = I.getArgOperand(1);
17131 Info.offset = 0;
17132 Info.align = DL.getABITypeAlign(ValTy);
17133    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17134    return true;
17135 }
17136 case Intrinsic::aarch64_ldaxp:
17137 case Intrinsic::aarch64_ldxp:
17138 Info.opc = ISD::INTRINSIC_W_CHAIN;
17139 Info.memVT = MVT::i128;
17140 Info.ptrVal = I.getArgOperand(0);
17141 Info.offset = 0;
17142 Info.align = Align(16);
17143    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17144    return true;
17145 case Intrinsic::aarch64_stlxp:
17146 case Intrinsic::aarch64_stxp:
17147 Info.opc = ISD::INTRINSIC_W_CHAIN;
17148 Info.memVT = MVT::i128;
17149 Info.ptrVal = I.getArgOperand(2);
17150 Info.offset = 0;
17151 Info.align = Align(16);
17152    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17153    return true;
17154 case Intrinsic::aarch64_sve_ldnt1: {
17155 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17156 Info.opc = ISD::INTRINSIC_W_CHAIN;
17157 Info.memVT = MVT::getVT(I.getType());
17158 Info.ptrVal = I.getArgOperand(1);
17159 Info.offset = 0;
17160 Info.align = DL.getABITypeAlign(ElTy);
17161    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
17162    return true;
17163 }
17164 case Intrinsic::aarch64_sve_stnt1: {
17165 Type *ElTy =
17166 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17167 Info.opc = ISD::INTRINSIC_W_CHAIN;
17168 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17169 Info.ptrVal = I.getArgOperand(2);
17170 Info.offset = 0;
17171 Info.align = DL.getABITypeAlign(ElTy);
17172    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
17173    return true;
17174 }
17175 case Intrinsic::aarch64_mops_memset_tag: {
17176 Value *Dst = I.getArgOperand(0);
17177 Value *Val = I.getArgOperand(1);
17178 Info.opc = ISD::INTRINSIC_W_CHAIN;
17179 Info.memVT = MVT::getVT(Val->getType());
17180 Info.ptrVal = Dst;
17181 Info.offset = 0;
17182 Info.align = I.getParamAlign(0).valueOrOne();
17183 Info.flags = MachineMemOperand::MOStore;
17184 // The size of the memory being operated on is unknown at this point
17185 Info.size = MemoryLocation::UnknownSize;
17186 return true;
17187 }
17188 default:
17189 break;
17190 }
17191
17192 return false;
17193}
17194
17195bool AArch64TargetLowering::shouldReduceLoadWidth(
17196 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17197 std::optional<unsigned> ByteOffset) const {
17198 // TODO: This may be worth removing. Check regression tests for diffs.
17199 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17200 ByteOffset))
17201 return false;
17202
17203 // If we're reducing the load width in order to avoid having to use an extra
17204 // instruction to do extension then it's probably a good idea.
17205 if (ExtTy != ISD::NON_EXTLOAD)
17206 return true;
17207 // Don't reduce load width if it would prevent us from combining a shift into
17208 // the offset.
17209 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17210 assert(Mem);
17211 const SDValue &Base = Mem->getBasePtr();
17212 if (Base.getOpcode() == ISD::ADD &&
17213 Base.getOperand(1).getOpcode() == ISD::SHL &&
17214 Base.getOperand(1).hasOneUse() &&
17215 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17216 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17217 if (Mem->getMemoryVT().isScalableVector())
17218 return false;
17219 // The shift can be combined if it matches the size of the value being
17220 // loaded (and so reducing the width would make it not match).
17221 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17222 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17223 if (ShiftAmount == Log2_32(LoadBytes))
17224 return false;
17225 }
17226 // We have no reason to disallow reducing the load width, so allow it.
17227 return true;
17228}
17229
17230// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17231bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
17232 EVT VT = Extend.getValueType();
17233 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17234 SDValue Extract = Extend.getOperand(0);
17235 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17236 Extract = Extract.getOperand(0);
17237 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17238 EVT VecVT = Extract.getOperand(0).getValueType();
17239 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17240 return false;
17241 }
17242 }
17243 return true;
17244}
17245
17246// Truncations from a 64-bit GPR to a 32-bit GPR are free.
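// For example, 'trunc i64 %x to i32' needs no instruction: the result is simply
// read from the W sub-register of the X register that holds %x.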
17247bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
17248 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17249 return false;
17250 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17251 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17252 return NumBits1 > NumBits2;
17253}
17254bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
17255 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17256 return false;
17257 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17258 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17259 return NumBits1 > NumBits2;
17260}
17261
17262/// Check if it is profitable to hoist an instruction in then/else to if.
17263/// Not profitable if I and its user can form an FMA instruction
17264/// because we prefer FMSUB/FMADD.
17265bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
17266 if (I->getOpcode() != Instruction::FMul)
17267 return true;
17268
17269 if (!I->hasOneUse())
17270 return true;
17271
17272 Instruction *User = I->user_back();
17273
17274 if (!(User->getOpcode() == Instruction::FSub ||
17275 User->getOpcode() == Instruction::FAdd))
17276 return true;
17277
17278 const TargetOptions &Options = getTargetMachine().Options;
17279 const Function *F = I->getFunction();
17280 const DataLayout &DL = F->getDataLayout();
17281 Type *Ty = User->getOperand(0)->getType();
17282
17283 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17284 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
17285 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17286 I->getFastMathFlags().allowContract()));
17287}
17288
17289// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17290// 64-bit GPR.
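// For example, 'zext i32 %x to i64' is free: any instruction writing w0 already
// clears bits [63:32] of x0, so no extra UXTW or AND is needed.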
17291bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
17292 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17293 return false;
17294 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17295 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17296 return NumBits1 == 32 && NumBits2 == 64;
17297}
17298bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
17299 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17300 return false;
17301 unsigned NumBits1 = VT1.getSizeInBits();
17302 unsigned NumBits2 = VT2.getSizeInBits();
17303 return NumBits1 == 32 && NumBits2 == 64;
17304}
17305
17306bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
17307 EVT VT1 = Val.getValueType();
17308 if (isZExtFree(VT1, VT2)) {
17309 return true;
17310 }
17311
17312 if (Val.getOpcode() != ISD::LOAD)
17313 return false;
17314
17315 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17316 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17317 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17318 VT1.getSizeInBits() <= 32);
17319}
17320
17321bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17322 if (isa<FPExtInst>(Ext))
17323 return false;
17324
17325 // Vector types are not free.
17326 if (Ext->getType()->isVectorTy())
17327 return false;
17328
17329 for (const Use &U : Ext->uses()) {
17330 // The extension is free if we can fold it with a left shift in an
17331 // addressing mode or an arithmetic operation: add, sub, and cmp.
17332
17333 // Is there a shift?
17334 const Instruction *Instr = cast<Instruction>(U.getUser());
17335
17336 // Is this a constant shift?
17337 switch (Instr->getOpcode()) {
17338 case Instruction::Shl:
17339 if (!isa<ConstantInt>(Instr->getOperand(1)))
17340 return false;
17341 break;
17342 case Instruction::GetElementPtr: {
17343 gep_type_iterator GTI = gep_type_begin(Instr);
17344 auto &DL = Ext->getDataLayout();
17345 std::advance(GTI, U.getOperandNo()-1);
17346 Type *IdxTy = GTI.getIndexedType();
17347 // This extension will end up with a shift because of the scaling factor.
17348 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17349 // Get the shift amount based on the scaling factor:
17350 // log2(sizeof(IdxTy)) - log2(8).
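// For example, an i32 index type gives log2(32) - 3 = 2, which matches the
// 'LSL #2' available in the register-offset addressing mode; an i8 element
// type gives 0 and is rejected below.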
17351 if (IdxTy->isScalableTy())
17352 return false;
17353 uint64_t ShiftAmt =
17354 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17355 3;
17356 // Is the constant foldable in the shift of the addressing mode?
17357 // I.e., shift amount is between 1 and 4 inclusive.
17358 if (ShiftAmt == 0 || ShiftAmt > 4)
17359 return false;
17360 break;
17361 }
17362 case Instruction::Trunc:
17363 // Check if this is a noop.
17364 // trunc(sext ty1 to ty2) to ty1.
17365 if (Instr->getType() == Ext->getOperand(0)->getType())
17366 continue;
17367 [[fallthrough]];
17368 default:
17369 return false;
17370 }
17371
17372 // At this point we can use the bfm family, so this extension is free
17373 // for that use.
17374 }
17375 return true;
17376}
17377
17378static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17379 unsigned NumElts, bool IsLittleEndian,
17380 SmallVectorImpl<int> &Mask) {
17381 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17382 return false;
17383
17384 assert(DstWidth % SrcWidth == 0 &&
17385 "TBL lowering is not supported for a conversion instruction with this "
17386 "source and destination element type.");
17387
17388 unsigned Factor = DstWidth / SrcWidth;
17389 unsigned MaskLen = NumElts * Factor;
17390
17391 Mask.clear();
17392 Mask.resize(MaskLen, NumElts);
17393
17394 unsigned SrcIndex = 0;
17395 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17396 Mask[I] = SrcIndex++;
17397
17398 return true;
17399}
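// For example, createTblShuffleMask(8, 32, 8, /*IsLittleEndian=*/true, Mask)
// sets Factor = 4 and produces
//   <0,8,8,8, 1,8,8,8, 2,8,8,8, 3,8,8,8, 4,8,8,8, 5,8,8,8, 6,8,8,8, 7,8,8,8>,
// where index 8 (== NumElts) selects the zero element supplied by the second
// shuffle operand in the callers below.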
17400
17401static Value *createTblShuffleForZExt(IRBuilder<> &Builder, Value *Op,
17402 FixedVectorType *ZExtTy,
17403 FixedVectorType *DstTy,
17404 bool IsLittleEndian) {
17405 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17406 unsigned NumElts = SrcTy->getNumElements();
17407 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17408 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17409
17410 SmallVector<int> Mask;
17411 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17412 return nullptr;
17413
17414 auto *FirstEltZero = Builder.CreateInsertElement(
17415 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17416 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17417 Result = Builder.CreateBitCast(Result, DstTy);
17418 if (DstTy != ZExtTy)
17419 Result = Builder.CreateZExt(Result, ZExtTy);
17420 return Result;
17421}
17422
17423static Value *createTblShuffleForSExt(IRBuilder<> &Builder, Value *Op,
17424 FixedVectorType *DstTy,
17425 bool IsLittleEndian) {
17426 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17427 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17428 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17429
17430 SmallVector<int> Mask;
17431 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17432 !IsLittleEndian, Mask))
17433 return nullptr;
17434
17435 auto *FirstEltZero = Builder.CreateInsertElement(
17436 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17437
17438 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17439}
17440
17441static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17442 IRBuilder<> Builder(TI);
17443 SmallVector<Value *> Parts;
17444 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17445 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17446 auto *DstTy = cast<FixedVectorType>(TI->getType());
17447 assert(SrcTy->getElementType()->isIntegerTy() &&
17448 "Non-integer type source vector element is not supported");
17449 assert(DstTy->getElementType()->isIntegerTy(8) &&
17450 "Unsupported destination vector element type");
17451 unsigned SrcElemTySz =
17452 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17453 unsigned DstElemTySz =
17454 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17455 assert((SrcElemTySz % DstElemTySz == 0) &&
17456 "Cannot lower truncate to tbl instructions for a source element size "
17457 "that is not divisible by the destination element size");
17458 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17459 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17460 "Unsupported source vector element type size");
17461 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17462
17463 // Create a mask to choose every nth byte from the source vector table of
17464 // bytes to create the truncated destination vector, where 'n' is the truncate
17465 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
17466 // 0,8,16,..Y*8th bytes for the little-endian format
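// For example, truncating <8 x i64> to <8 x i8> on little-endian selects bytes
// 0, 8, 16, ..., 56 from the 64-byte table; the remaining 8 mask lanes are set
// to 255, which is out of range and therefore yields 0 from TBL.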
17467 SmallVector<Constant *, 16> MaskConst;
17468 for (int Itr = 0; Itr < 16; Itr++) {
17469 if (Itr < NumElements)
17470 MaskConst.push_back(Builder.getInt8(
17471 IsLittleEndian ? Itr * TruncFactor
17472 : Itr * TruncFactor + (TruncFactor - 1)));
17473 else
17474 MaskConst.push_back(Builder.getInt8(255));
17475 }
17476
17477 int MaxTblSz = 128 * 4;
17478 int MaxSrcSz = SrcElemTySz * NumElements;
17479 int ElemsPerTbl =
17480 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17481 assert(ElemsPerTbl <= 16 &&
17482 "Maximum elements selected using TBL instruction cannot exceed 16!");
17483
17484 int ShuffleCount = 128 / SrcElemTySz;
17485 SmallVector<int> ShuffleLanes;
17486 for (int i = 0; i < ShuffleCount; ++i)
17487 ShuffleLanes.push_back(i);
17488
17489 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17490 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17491 // call TBL & save the result in a vector of TBL results for combining later.
17492 SmallVector<Value *> Results;
17493 while (ShuffleLanes.back() < NumElements) {
17494 Parts.push_back(Builder.CreateBitCast(
17495 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17496
17497 if (Parts.size() == 4) {
17498 Parts.push_back(ConstantVector::get(MaskConst));
17499 Results.push_back(
17500 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17501 Parts.clear();
17502 }
17503
17504 for (int i = 0; i < ShuffleCount; ++i)
17505 ShuffleLanes[i] += ShuffleCount;
17506 }
17507
17508 assert((Parts.empty() || Results.empty()) &&
17509 "Lowering trunc for vectors requiring different TBL instructions is "
17510 "not supported!");
17511 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17512 // registers
17513 if (!Parts.empty()) {
17514 Intrinsic::ID TblID;
17515 switch (Parts.size()) {
17516 case 1:
17517 TblID = Intrinsic::aarch64_neon_tbl1;
17518 break;
17519 case 2:
17520 TblID = Intrinsic::aarch64_neon_tbl2;
17521 break;
17522 case 3:
17523 TblID = Intrinsic::aarch64_neon_tbl3;
17524 break;
17525 }
17526
17527 Parts.push_back(ConstantVector::get(MaskConst));
17528 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17529 }
17530
17531 // Extract the destination vector from TBL result(s) after combining them
17532 // where applicable. Currently, at most two TBLs are supported.
17533 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17534 "more than 2 tbl instructions!");
17535 Value *FinalResult = Results[0];
17536 if (Results.size() == 1) {
17537 if (ElemsPerTbl < 16) {
17538 SmallVector<int> FinalMask(ElemsPerTbl);
17539 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17540 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17541 }
17542 } else {
17543 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17544 if (ElemsPerTbl < 16) {
17545 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17546 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17547 } else {
17548 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17549 }
17550 FinalResult =
17551 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17552 }
17553
17554 TI->replaceAllUsesWith(FinalResult);
17555 TI->eraseFromParent();
17556}
17557
17558bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
17559 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17560 // shuffle_vector instructions are serialized when targeting SVE,
17561 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17562 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17563 return false;
17564
17565 // Try to optimize conversions using tbl. This requires materializing constant
17566 // index vectors, which can increase code size and add loads. Skip the
17567 // transform unless the conversion is in a loop block guaranteed to execute
17568 // and we are not optimizing for size.
17569 Function *F = I->getParent()->getParent();
17570 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17571 return false;
17572
17573 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17574 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17575 if (!SrcTy || !DstTy)
17576 return false;
17577
17578 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17579 // lowered to tbl instructions to insert the original i8 elements
17580 // into i8x lanes. This is enabled for cases where it is beneficial.
17581 auto *ZExt = dyn_cast<ZExtInst>(I);
17582 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
17583 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
17584 if (DstWidth % 8 != 0)
17585 return false;
17586
17587 auto *TruncDstType =
17589 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
17590 // the remaining ZExt folded into the user, don't use tbl lowering.
17591 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
17592 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
17595 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
17596 return false;
17597
17598 DstTy = TruncDstType;
17599 }
17600
17601 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
17602 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
17603 // most one extra extend step is needed and using tbl is not profitable.
17604 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
17605 // udot instruction.
17606 if (SrcWidth * 4 <= DstWidth) {
17607 if (all_of(I->users(), [&](auto *U) {
17608 using namespace llvm::PatternMatch;
17609 auto *SingleUser = cast<Instruction>(&*U);
17610 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
17611 return true;
17612 if (match(SingleUser,
17613 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
17614 m_Value(), m_Specific(I))))
17615 return true;
17616 return false;
17617 }))
17618 return false;
17619 }
17620
17621 if (DstTy->getScalarSizeInBits() >= 64)
17622 return false;
17623
17624 IRBuilder<> Builder(ZExt);
17625 Value *Result = createTblShuffleForZExt(
17626 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
17627 DstTy, Subtarget->isLittleEndian());
17628 if (!Result)
17629 return false;
17630 ZExt->replaceAllUsesWith(Result);
17631 ZExt->eraseFromParent();
17632 return true;
17633 }
17634
17635 auto *UIToFP = dyn_cast<UIToFPInst>(I);
17636 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
17637 DstTy->getElementType()->isFloatTy()) ||
17638 (SrcTy->getElementType()->isIntegerTy(16) &&
17639 DstTy->getElementType()->isDoubleTy()))) {
17640 IRBuilder<> Builder(I);
17641 Value *ZExt = createTblShuffleForZExt(
17642 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
17643 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
17644 assert(ZExt && "Cannot fail for the i8 to float conversion");
17645 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
17646 I->replaceAllUsesWith(UI);
17647 I->eraseFromParent();
17648 return true;
17649 }
17650
17651 auto *SIToFP = dyn_cast<SIToFPInst>(I);
17652 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
17653 DstTy->getElementType()->isFloatTy()) {
17654 IRBuilder<> Builder(I);
17655 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
17656 FixedVectorType::getInteger(DstTy),
17657 Subtarget->isLittleEndian());
17658 assert(Shuffle && "Cannot fail for the i8 to float conversion");
17659 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
17660 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
17661 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
17662 I->replaceAllUsesWith(SI);
17663 I->eraseFromParent();
17664 return true;
17665 }
17666
17667 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
17668// followed by a truncate lowered using tbl.4.
17669 auto *FPToUI = dyn_cast<FPToUIInst>(I);
17670 if (FPToUI &&
17671 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
17672 SrcTy->getElementType()->isFloatTy() &&
17673 DstTy->getElementType()->isIntegerTy(8)) {
17674 IRBuilder<> Builder(I);
17675 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
17676 VectorType::getInteger(SrcTy));
17677 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
17678 I->replaceAllUsesWith(TruncI);
17679 I->eraseFromParent();
17680 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
17681 return true;
17682 }
17683
17684 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
17685 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
17686 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
17687 // registers
17688 auto *TI = dyn_cast<TruncInst>(I);
17689 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
17690 ((SrcTy->getElementType()->isIntegerTy(32) ||
17691 SrcTy->getElementType()->isIntegerTy(64)) &&
17692 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
17693 createTblForTrunc(TI, Subtarget->isLittleEndian());
17694 return true;
17695 }
17696
17697 return false;
17698}
17699
17700bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
17701 Align &RequiredAlignment) const {
17702 if (!LoadedType.isSimple() ||
17703 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
17704 return false;
17705 // Cyclone supports unaligned accesses.
17706 RequiredAlignment = Align(1);
17707 unsigned NumBits = LoadedType.getSizeInBits();
17708 return NumBits == 32 || NumBits == 64;
17709}
17710
17711/// A helper function for determining the number of interleaved accesses we
17712/// will generate when lowering accesses of the given type.
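/// For example, a <16 x i32> access on NEON requires (16 * 32 + 127) / 128 = 4
/// interleaved accesses, whereas an <8 x i16> access requires only one.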
17713unsigned AArch64TargetLowering::getNumInterleavedAccesses(
17714 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
17715 unsigned VecSize = 128;
17716 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17717 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17718 if (UseScalable && isa<FixedVectorType>(VecTy))
17719 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17720 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17721}
17722
17723MachineMemOperand::Flags
17724AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17725 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17726 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17727 return MOStridedAccess;
17728 return MachineMemOperand::MONone;
17729}
17730
17731bool AArch64TargetLowering::isLegalInterleavedAccessType(
17732 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17733 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17734 auto EC = VecTy->getElementCount();
17735 unsigned MinElts = EC.getKnownMinValue();
17736
17737 UseScalable = false;
17738
17739 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17740 (!Subtarget->useSVEForFixedLengthVectors() ||
17742 return false;
17743
17744 if (isa<ScalableVectorType>(VecTy) &&
17745 !Subtarget->isSVEorStreamingSVEAvailable())
17746 return false;
17747
17748 // Ensure the number of vector elements is greater than 1.
17749 if (MinElts < 2)
17750 return false;
17751
17752 // Ensure the element type is legal.
17753 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17754 return false;
17755
17756 if (EC.isScalable()) {
17757 UseScalable = true;
17758 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17759 }
17760
17761 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17762 if (Subtarget->useSVEForFixedLengthVectors()) {
17763 unsigned MinSVEVectorSize =
17764 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17765 if (VecSize % MinSVEVectorSize == 0 ||
17766 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17767 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17768 UseScalable = true;
17769 return true;
17770 }
17771 }
17772
17773 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17774 // 128 will be split into multiple interleaved accesses.
17775 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17776}
17777
17778static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17779 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17780 return ScalableVectorType::get(VTy->getElementType(), 2);
17781
17782 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17783 return ScalableVectorType::get(VTy->getElementType(), 4);
17784
17785 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17786 return ScalableVectorType::get(VTy->getElementType(), 8);
17787
17788 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17789 return ScalableVectorType::get(VTy->getElementType(), 8);
17790
17791 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17792 return ScalableVectorType::get(VTy->getElementType(), 2);
17793
17794 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17795 return ScalableVectorType::get(VTy->getElementType(), 4);
17796
17797 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17798 return ScalableVectorType::get(VTy->getElementType(), 8);
17799
17800 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17801 return ScalableVectorType::get(VTy->getElementType(), 16);
17802
17803 llvm_unreachable("Cannot handle input vector type");
17804}
17805
17806static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17807 bool Scalable, Type *LDVTy,
17808 Type *PtrTy) {
17809 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17810 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17811 Intrinsic::aarch64_sve_ld3_sret,
17812 Intrinsic::aarch64_sve_ld4_sret};
17813 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17814 Intrinsic::aarch64_neon_ld3,
17815 Intrinsic::aarch64_neon_ld4};
17816 if (Scalable)
17817 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17818
17819 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17820 {LDVTy, PtrTy});
17821}
17822
17823static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17824 bool Scalable, Type *STVTy,
17825 Type *PtrTy) {
17826 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17827 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17828 Intrinsic::aarch64_sve_st3,
17829 Intrinsic::aarch64_sve_st4};
17830 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17831 Intrinsic::aarch64_neon_st3,
17832 Intrinsic::aarch64_neon_st4};
17833 if (Scalable)
17834 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17835
17836 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17837 {STVTy, PtrTy});
17838}
17839
17840/// Lower an interleaved load into a ldN intrinsic.
17841///
17842/// E.g. Lower an interleaved load (Factor = 2):
17843/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17844/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17845/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17846///
17847/// Into:
17848/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17849/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17850/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17851bool AArch64TargetLowering::lowerInterleavedLoad(
17852 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
17853 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
17854 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17855 "Invalid interleave factor");
17856 assert(!Shuffles.empty() && "Empty shufflevector input");
17857 assert(Shuffles.size() == Indices.size() &&
17858 "Unmatched number of shufflevectors and indices");
17859
17860 auto *LI = dyn_cast<LoadInst>(Load);
17861 if (!LI)
17862 return false;
17863 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
17864
17865 const DataLayout &DL = LI->getDataLayout();
17866
17867 VectorType *VTy = Shuffles[0]->getType();
17868
17869 // Skip if we do not have NEON and skip illegal vector types. We can
17870 // "legalize" wide vector types into multiple interleaved accesses as long as
17871 // the vector types are divisible by 128.
17872 bool UseScalable;
17873 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17874 return false;
17875
17876 // Check if the interleave is a zext(shuffle), that can be better optimized
17877 // into shift / and masks. For the moment we do this just for uitofp (not
17878 // zext) to avoid issues with widening instructions.
17879 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17880 using namespace llvm::PatternMatch;
17881 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17882 SI->getType()->getScalarSizeInBits() * 4 ==
17883 SI->user_back()->getType()->getScalarSizeInBits();
17884 }))
17885 return false;
17886
17887 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17888
17889 auto *FVTy = cast<FixedVectorType>(VTy);
17890
17891 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17892 // load integer vectors first and then convert to pointer vectors.
17893 Type *EltTy = FVTy->getElementType();
17894 if (EltTy->isPointerTy())
17895 FVTy =
17896 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17897
17898 // If we're going to generate more than one load, reset the sub-vector type
17899 // to something legal.
17900 FVTy = FixedVectorType::get(FVTy->getElementType(),
17901 FVTy->getNumElements() / NumLoads);
17902
17903 auto *LDVTy =
17904 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17905
17906 IRBuilder<> Builder(LI);
17907
17908 // The base address of the load.
17909 Value *BaseAddr = LI->getPointerOperand();
17910
17911 Type *PtrTy = LI->getPointerOperandType();
17912 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17913 LDVTy->getElementCount());
17914
17915 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17916 UseScalable, LDVTy, PtrTy);
17917
17918 // Holds sub-vectors extracted from the load intrinsic return values. The
17919 // sub-vectors are associated with the shufflevector instructions they will
17920 // replace.
17921 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17922
17923 Value *PTrue = nullptr;
17924 if (UseScalable) {
17925 std::optional<unsigned> PgPattern =
17926 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17927 if (Subtarget->getMinSVEVectorSizeInBits() ==
17928 Subtarget->getMaxSVEVectorSizeInBits() &&
17929 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17930 PgPattern = AArch64SVEPredPattern::all;
17931
17932 auto *PTruePat =
17933 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17934 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17935 {PTruePat});
17936 }
17937
17938 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17939
17940 // If we're generating more than one load, compute the base address of
17941 // subsequent loads as an offset from the previous.
17942 if (LoadCount > 0)
17943 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17944 FVTy->getNumElements() * Factor);
17945
17946 CallInst *LdN;
17947 if (UseScalable)
17948 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17949 else
17950 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17951
17952 // Extract and store the sub-vectors returned by the load intrinsic.
17953 for (unsigned i = 0; i < Shuffles.size(); i++) {
17954 ShuffleVectorInst *SVI = Shuffles[i];
17955 unsigned Index = Indices[i];
17956
17957 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17958
17959 if (UseScalable)
17960 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
17961
17962 // Convert the integer vector to pointer vector if the element is pointer.
17963 if (EltTy->isPointerTy())
17964 SubVec = Builder.CreateIntToPtr(
17965 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
17966 FVTy->getNumElements()));
17967
17968 SubVecs[SVI].push_back(SubVec);
17969 }
17970 }
17971
17972 // Replace uses of the shufflevector instructions with the sub-vectors
17973 // returned by the load intrinsic. If a shufflevector instruction is
17974 // associated with more than one sub-vector, those sub-vectors will be
17975 // concatenated into a single wide vector.
17976 for (ShuffleVectorInst *SVI : Shuffles) {
17977 auto &SubVec = SubVecs[SVI];
17978 auto *WideVec =
17979 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17980 SVI->replaceAllUsesWith(WideVec);
17981 }
17982
17983 return true;
17984}
17985
17986template <typename Iter>
17987bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17988 int MaxLookupDist = 20;
17989 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17990 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17991 const Value *PtrA1 =
17992 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17993
17994 while (++It != End) {
17995 if (It->isDebugOrPseudoInst())
17996 continue;
17997 if (MaxLookupDist-- == 0)
17998 break;
17999 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
18000 const Value *PtrB1 =
18001 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
18002 DL, OffsetB);
18003 if (PtrA1 == PtrB1 &&
18004 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
18005 .abs() == 16)
18006 return true;
18007 }
18008 }
18009
18010 return false;
18011}
18012
18013/// Lower an interleaved store into a stN intrinsic.
18014///
18015/// E.g. Lower an interleaved store (Factor = 3):
18016/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
18017/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
18018/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18019///
18020/// Into:
18021/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
18022/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
18023/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
18024/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18025///
18026/// Note that the new shufflevectors will be removed and we'll only generate one
18027/// st3 instruction in CodeGen.
18028///
18029/// Example for a more general valid mask (Factor 3). Lower:
18030/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
18031/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
18032/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18033///
18034/// Into:
18035/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
18036/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
18037/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
18038/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18039bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
18040 Value *LaneMask,
18041 ShuffleVectorInst *SVI,
18042 unsigned Factor,
18043 const APInt &GapMask) const {
18044
18045 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18046 "Invalid interleave factor");
18047 auto *SI = dyn_cast<StoreInst>(Store);
18048 if (!SI)
18049 return false;
18050 assert(!LaneMask && GapMask.popcount() == Factor &&
18051 "Unexpected mask on store");
18052
18053 auto *VecTy = cast<FixedVectorType>(SVI->getType());
18054 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18055
18056 unsigned LaneLen = VecTy->getNumElements() / Factor;
18057 Type *EltTy = VecTy->getElementType();
18058 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18059
18060 const DataLayout &DL = SI->getDataLayout();
18061 bool UseScalable;
18062
18063 // Skip if we do not have NEON and skip illegal vector types. We can
18064 // "legalize" wide vector types into multiple interleaved accesses as long as
18065 // the vector types are divisible by 128.
18066 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18067 return false;
18068
18069 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18070
18071 Value *Op0 = SVI->getOperand(0);
18072 Value *Op1 = SVI->getOperand(1);
18073 IRBuilder<> Builder(SI);
18074
18075 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18076 // vectors to integer vectors.
18077 if (EltTy->isPointerTy()) {
18078 Type *IntTy = DL.getIntPtrType(EltTy);
18079 unsigned NumOpElts =
18080 cast<FixedVectorType>(Op0->getType())->getNumElements();
18081
18082 // Convert to the corresponding integer vector.
18083 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18084 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18085 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18086
18087 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18088 }
18089
18090 // If we're going to generate more than one store, reset the lane length
18091 // and sub-vector type to something legal.
18092 LaneLen /= NumStores;
18093 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18094
18095 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18096 : SubVecTy;
18097
18098 // The base address of the store.
18099 Value *BaseAddr = SI->getPointerOperand();
18100
18101 auto Mask = SVI->getShuffleMask();
18102
18103 // Sanity check: bail out if all the indices are out of range.
18104 // If the mask is `poison`, `Mask` may be a vector of -1s.
18105 // If all of them are `poison`, an out-of-bounds read would happen later.
18106 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
18107 return false;
18108 }
18109 // A 64-bit st2 which does not start at element 0 will involve adding extra
18110 // ext elements making the st2 unprofitable, and if there is a nearby store
18111 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18112 // zip;ldp pair which has higher throughput.
18113 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18114 (Mask[0] != 0 ||
18115 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18116 DL) ||
18117 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18118 BaseAddr, DL)))
18119 return false;
18120
18121 Type *PtrTy = SI->getPointerOperandType();
18122 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18123 STVTy->getElementCount());
18124
18125 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18126 UseScalable, STVTy, PtrTy);
18127
18128 Value *PTrue = nullptr;
18129 if (UseScalable) {
18130 std::optional<unsigned> PgPattern =
18131 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18132 if (Subtarget->getMinSVEVectorSizeInBits() ==
18133 Subtarget->getMaxSVEVectorSizeInBits() &&
18134 Subtarget->getMinSVEVectorSizeInBits() ==
18135 DL.getTypeSizeInBits(SubVecTy))
18136 PgPattern = AArch64SVEPredPattern::all;
18137
18138 auto *PTruePat =
18139 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18140 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18141 {PTruePat});
18142 }
18143
18144 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18145
18146 SmallVector<Value *, 4> Ops;
18147
18148 // Split the shufflevector operands into sub vectors for the new stN call.
18149 for (unsigned i = 0; i < Factor; i++) {
18150 Value *Shuffle;
18151 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18152 if (Mask[IdxI] >= 0) {
18153 Shuffle = Builder.CreateShuffleVector(
18154 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18155 } else {
18156 unsigned StartMask = 0;
18157 for (unsigned j = 1; j < LaneLen; j++) {
18158 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18159 if (Mask[IdxJ] >= 0) {
18160 StartMask = Mask[IdxJ] - j;
18161 break;
18162 }
18163 }
18164 // Note: Filling undef gaps with random elements is ok, since
18165 // those elements were being written anyway (with undefs).
18166 // In the case of all undefs, we default to using elements from 0.
18167 // Note: StartMask cannot be negative, it's checked in
18168 // isReInterleaveMask
18169 Shuffle = Builder.CreateShuffleVector(
18170 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18171 }
18172
18173 if (UseScalable)
18174 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18175 Shuffle, uint64_t(0));
18176
18177 Ops.push_back(Shuffle);
18178 }
18179
18180 if (UseScalable)
18181 Ops.push_back(PTrue);
18182
18183 // If we're generating more than one store, compute the base address of
18184 // subsequent stores as an offset from the previous.
18185 if (StoreCount > 0)
18186 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18187 BaseAddr, LaneLen * Factor);
18188
18189 Ops.push_back(BaseAddr);
18190 Builder.CreateCall(StNFunc, Ops);
18191 }
18192 return true;
18193}
18194
18195bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
18196 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18197 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18198 if (Factor != 2 && Factor != 3 && Factor != 4) {
18199 LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
18200 return false;
18201 }
18202 auto *LI = dyn_cast<LoadInst>(Load);
18203 if (!LI)
18204 return false;
18205 assert(!Mask && "Unexpected mask on a load\n");
18206
18208
18209 const DataLayout &DL = LI->getModule()->getDataLayout();
18210 bool UseScalable;
18211 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18212 return false;
18213
18214 // TODO: Add support for using SVE instructions with fixed types later, using
18215 // the code from lowerInterleavedLoad to obtain the correct container type.
18216 if (UseScalable && !VTy->isScalableTy())
18217 return false;
18218
18219 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18220 VectorType *LdTy =
18221 VectorType::get(VTy->getElementType(),
18222 VTy->getElementCount().divideCoefficientBy(NumLoads));
18223
18224 Type *PtrTy = LI->getPointerOperandType();
18225 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18226 UseScalable, LdTy, PtrTy);
18227
18228 IRBuilder<> Builder(LI);
18229 Value *Pred = nullptr;
18230 if (UseScalable)
18231 Pred =
18232 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18233
18234 Value *BaseAddr = LI->getPointerOperand();
18235 Value *Result = nullptr;
18236 if (NumLoads > 1) {
18237 // Create multiple legal small ldN.
18238 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18239 for (unsigned I = 0; I < NumLoads; ++I) {
18240 Value *Offset = Builder.getInt64(I * Factor);
18241
18242 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18243 Value *LdN = nullptr;
18244 if (UseScalable)
18245 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18246 else
18247 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18248 Value *Idx =
18249 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18250 for (unsigned J = 0; J < Factor; ++J) {
18251 ExtractedLdValues[J] = Builder.CreateInsertVector(
18252 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18253 }
18254 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18255 }
18256
18257 // Merge the values from different factors.
18258 Result = PoisonValue::get(DI->getType());
18259 for (unsigned J = 0; J < Factor; ++J)
18260 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18261 } else {
18262 if (UseScalable)
18263 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18264 else
18265 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18266 }
18267
18268 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
18269 DI->replaceAllUsesWith(Result);
18270 return true;
18271}
18272
18273bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
18274 Instruction *Store, Value *Mask,
18275 ArrayRef<Value *> InterleavedValues) const {
18276 unsigned Factor = InterleavedValues.size();
18277 if (Factor != 2 && Factor != 3 && Factor != 4) {
18278 LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
18279 return false;
18280 }
18281 auto *SI = dyn_cast<StoreInst>(Store);
18282 if (!SI)
18283 return false;
18284 assert(!Mask && "Unexpected mask on plain store");
18285
18286 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18287 const DataLayout &DL = SI->getModule()->getDataLayout();
18288
18289 bool UseScalable;
18290 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18291 return false;
18292
18293 // TODO: Add support for using SVE instructions with fixed types later, using
18294 // the code from lowerInterleavedStore to obtain the correct container type.
18295 if (UseScalable && !VTy->isScalableTy())
18296 return false;
18297
18298 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18299
18300 VectorType *StTy =
18301 VectorType::get(VTy->getElementType(),
18302 VTy->getElementCount().divideCoefficientBy(NumStores));
18303
18304 Type *PtrTy = SI->getPointerOperandType();
18305 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18306 UseScalable, StTy, PtrTy);
18307
18308 IRBuilder<> Builder(SI);
18309
18310 Value *BaseAddr = SI->getPointerOperand();
18311 Value *Pred = nullptr;
18312
18313 if (UseScalable)
18314 Pred =
18315 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18316
18317 auto ExtractedValues = InterleavedValues;
18318 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18319 if (UseScalable)
18320 StoreOperands.push_back(Pred);
18321 StoreOperands.push_back(BaseAddr);
18322 for (unsigned I = 0; I < NumStores; ++I) {
18323 Value *Address = BaseAddr;
18324 if (NumStores > 1) {
18325 Value *Offset = Builder.getInt64(I * Factor);
18326 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18327 Value *Idx =
18328 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18329 for (unsigned J = 0; J < Factor; J++) {
18330 StoreOperands[J] =
18331 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18332 }
18333 // update the address
18334 StoreOperands[StoreOperands.size() - 1] = Address;
18335 }
18336 Builder.CreateCall(StNFunc, StoreOperands);
18337 }
18338 return true;
18339}
18340
18341EVT AArch64TargetLowering::getOptimalMemOpType(
18342 LLVMContext &Context, const MemOp &Op,
18343 const AttributeList &FuncAttributes) const {
18344 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18345 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18346 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18347 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18348 // taken one instruction to materialize the v2i64 zero and one store (with
18349 // restrictive addressing mode). Just do i64 stores.
18350 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18351 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18352 if (Op.isAligned(AlignCheck))
18353 return true;
18354 unsigned Fast;
18355 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18356 MachineMemOperand::MONone, &Fast) &&
18357 Fast;
18358 };
18359
18360 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18361 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18362 return MVT::v16i8;
18363 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18364 return MVT::f128;
18365 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18366 return MVT::i64;
18367 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18368 return MVT::i32;
18369 return MVT::Other;
18370}
18371
18372LLT AArch64TargetLowering::getOptimalMemOpLLT(
18373 const MemOp &Op, const AttributeList &FuncAttributes) const {
18374 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18375 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18376 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18377 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18378 // taken one instruction to materialize the v2i64 zero and one store (with
18379 // restrictive addressing mode). Just do i64 stores.
18380 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18381 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18382 if (Op.isAligned(AlignCheck))
18383 return true;
18384 unsigned Fast;
18385 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18386 MachineMemOperand::MONone, &Fast) &&
18387 Fast;
18388 };
18389
18390 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18391 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18392 return LLT::fixed_vector(2, 64);
18393 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18394 return LLT::scalar(128);
18395 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18396 return LLT::scalar(64);
18397 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18398 return LLT::scalar(32);
18399 return LLT();
18400}
18401
18402// 12-bit optionally shifted immediates are legal for adds.
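// For example, 0xabc and 0xabc000 (0xabc << 12) are legal add immediates,
// whereas 0xabcd or 0x123456 are not and must be materialized separately.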
18403bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
18404 if (Immed == std::numeric_limits<int64_t>::min()) {
18405 return false;
18406 }
18407 // Same encoding for add/sub, just flip the sign.
18408 return isLegalArithImmed((uint64_t)std::abs(Immed));
18409}
18410
18411bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
18412 // We will only emit addvl/inc* instructions for SVE2.
18413 if (!Subtarget->hasSVE2())
18414 return false;
18415
18416 // addvl's immediates are in terms of the number of bytes in a register.
18417 // Since there are 16 in the base supported size (128bits), we need to
18418 // divide the immediate by that much to give us a useful immediate to
18419 // multiply by vscale. We can't have a remainder as a result of this.
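// For example, a scalable offset of 64 * vscale bytes maps to 'addvl x0, x0, #4'
// (64 / 16 == 4); the resulting multiplier must fit in a signed 6-bit field.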
18420 if (Imm % 16 == 0)
18421 return isInt<6>(Imm / 16);
18422
18423 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18424 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18425 // of addvl as a result, so only take h|w|d into account.
18426 // Dec[h|w|d] will cover subtractions.
18427 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18428 // FIXME: Can we make use of other patterns to cover other immediates?
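// For example, 40 * vscale bytes is not a multiple of 16 but equals 5 * 8, so it
// can be handled by 'inch xN, all, mul #5'; -12 * vscale bytes maps to
// 'decw xN, all, mul #3'.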
18429
18430 // inch|dech
18431 if (Imm % 8 == 0)
18432 return std::abs(Imm / 8) <= 16;
18433 // incw|decw
18434 if (Imm % 4 == 0)
18435 return std::abs(Imm / 4) <= 16;
18436 // incd|decd
18437 if (Imm % 2 == 0)
18438 return std::abs(Imm / 2) <= 16;
18439
18440 return false;
18441}
18442
18443// Return false to prevent folding
18444// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18445// if the folding leads to worse code.
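// For example, with c1 == 1 and c2 == 0x111111 the fold is rejected: c1 is a
// legal add immediate, but c1*c2 == 0x111111 needs a MOVZ plus a MOVK to
// materialize.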
18446bool AArch64TargetLowering::isMulAddWithConstProfitable(
18447 SDValue AddNode, SDValue ConstNode) const {
18448 // Let the DAGCombiner decide for vector types and large types.
18449 const EVT VT = AddNode.getValueType();
18450 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18451 return true;
18452
18453 // It is worse if c1 is a legal add immediate while c1*c2 is not, and the
18454 // product has to be composed of at least two instructions.
18455 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18456 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18457 const int64_t C1 = C1Node->getSExtValue();
18458 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18459 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
18460 return true;
18461 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
18462 // Adapt to the width of a register.
18463 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18464 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18465 if (Insn.size() > 1)
18466 return false;
18467
18468 // Default to true and let the DAGCombiner decide.
18469 return true;
18470}
18471
18472// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18473// immediates is the same as for an add or a sub.
18474bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
18475 return isLegalAddImmediate(Immed);
18476}
18477
18478/// isLegalAddressingMode - Return true if the addressing mode represented
18479/// by AM is legal for this target, for a load/store of the specified type.
18480bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
18481 const AddrMode &AMode, Type *Ty,
18482 unsigned AS, Instruction *I) const {
18483 // AArch64 has five basic addressing modes:
18484 // reg
18485 // reg + 9-bit signed offset
18486 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18487 // reg1 + reg2
18488 // reg + SIZE_IN_BYTES * reg
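// For example, for an i64 access both [x0, #32760] (8 * 4095) and
// [x0, x1, lsl #3] are legal, whereas 'x0 + x1 + 8' is not.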
18489
18490 // No global is ever allowed as a base.
18491 if (AMode.BaseGV)
18492 return false;
18493
18494 // No reg+reg+imm addressing.
18495 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18496 return false;
18497
18498 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18499 // `2*ScaledReg` into `BaseReg + ScaledReg`
18500 AddrMode AM = AMode;
18501 if (AM.Scale && !AM.HasBaseReg) {
18502 if (AM.Scale == 1) {
18503 AM.HasBaseReg = true;
18504 AM.Scale = 0;
18505 } else if (AM.Scale == 2) {
18506 AM.HasBaseReg = true;
18507 AM.Scale = 1;
18508 } else {
18509 return false;
18510 }
18511 }
18512
18513 // A base register is required in all addressing modes.
18514 if (!AM.HasBaseReg)
18515 return false;
18516
18517 if (Ty->isScalableTy()) {
18518 if (isa<ScalableVectorType>(Ty)) {
18519 // See if we have a foldable vscale-based offset, for vector types which
18520 // are either legal or smaller than the minimum; more work will be
18521 // required if we need to consider addressing for types which need
18522 // legalization by splitting.
18523 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18524 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18525 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18526 isPowerOf2_64(VecNumBytes))
18527 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18528
18529 uint64_t VecElemNumBytes =
18530 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18531 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18532 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18533 }
18534
18535 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18536 }
18537
18538 // No scalable offsets allowed for non-scalable types.
18539 if (AM.ScalableOffset)
18540 return false;
18541
18542 // check reg + imm case:
18543 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18544 uint64_t NumBytes = 0;
18545 if (Ty->isSized()) {
18546 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18547 NumBytes = NumBits / 8;
18548 if (!isPowerOf2_64(NumBits))
18549 NumBytes = 0;
18550 }
18551
18552 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18553 AM.Scale);
18554}
18555
18556// Check whether the two offsets belong to the same imm24 range and their high
18557// 12 bits are the same; if so, the common high part can be encoded in an add immediate.
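// For example, MinOffset = 0x3008 and MaxOffset = 0x3ff0 share the high part
// 0x3000, which is itself a legal add immediate, so 0x3000 is returned and the
// accesses can be rebased onto base + 0x3000.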
18558int64_t
18559AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18560 int64_t MaxOffset) const {
18561 int64_t HighPart = MinOffset & ~0xfffULL;
18562 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18563 // Rebase the value to an integer multiple of imm12.
18564 return HighPart;
18565 }
18566
18567 return 0;
18568}
18569
18570bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18571 // Consider splitting a large offset of a struct or array.
18572 return true;
18573}
18574
18575bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
18576 const MachineFunction &MF, EVT VT) const {
18577 EVT ScalarVT = VT.getScalarType();
18578
18579 if (!ScalarVT.isSimple())
18580 return false;
18581
18582 switch (ScalarVT.getSimpleVT().SimpleTy) {
18583 case MVT::f16:
18584 return Subtarget->hasFullFP16();
18585 case MVT::f32:
18586 case MVT::f64:
18587 return true;
18588 case MVT::bf16:
18589 return VT.isScalableVector() && Subtarget->hasBF16() &&
18590 Subtarget->isNonStreamingSVEorSME2Available();
18591 default:
18592 break;
18593 }
18594
18595 return false;
18596}
18597
18598bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
18599 Type *Ty) const {
18600 switch (Ty->getScalarType()->getTypeID()) {
18601 case Type::FloatTyID:
18602 case Type::DoubleTyID:
18603 return true;
18604 default:
18605 return false;
18606 }
18607}
18608
18609bool AArch64TargetLowering::generateFMAsInMachineCombiner(
18610 EVT VT, CodeGenOptLevel OptLevel) const {
18611 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18612 !useSVEForFixedLengthVectorVT(VT);
18613}
18614
18615const MCPhysReg *
18616AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
18617 // LR is a callee-save register, but we must treat it as clobbered by any call
18618 // site. Hence we include LR in the scratch registers, which are in turn added
18619 // as implicit-defs for stackmaps and patchpoints.
18620 static const MCPhysReg ScratchRegs[] = {
18621 AArch64::X16, AArch64::X17, AArch64::LR, 0
18622 };
18623 return ScratchRegs;
18624}
18625
18626ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
18627 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18628 return RCRegs;
18629}
18630
18631bool
18632AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
18633 CombineLevel Level) const {
18634 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18635 N->getOpcode() == ISD::SRL) &&
18636 "Expected shift op");
18637
18638 SDValue ShiftLHS = N->getOperand(0);
18639 EVT VT = N->getValueType(0);
18640
18641 if (!ShiftLHS->hasOneUse())
18642 return false;
18643
18644 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18645 !ShiftLHS.getOperand(0)->hasOneUse())
18646 return false;
18647
18648 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
18649 // combine it with shift 'N' to let it be lowered to UBFX except:
18650 // ((x >> C) & mask) << C.
18651 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18652 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18653 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18654 if (isMask_64(TruncMask)) {
18655 SDValue AndLHS = ShiftLHS.getOperand(0);
18656 if (AndLHS.getOpcode() == ISD::SRL) {
18657 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18658 if (N->getOpcode() == ISD::SHL)
18659 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18660 return SRLC->getZExtValue() == SHLC->getZExtValue();
18661 return false;
18662 }
18663 }
18664 }
18665 }
18666 return true;
18667}
18668
18669bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
18670 const SDNode *N) const {
18671 assert(N->getOpcode() == ISD::XOR &&
18672 (N->getOperand(0).getOpcode() == ISD::SHL ||
18673 N->getOperand(0).getOpcode() == ISD::SRL) &&
18674 "Expected XOR(SHIFT) pattern");
18675
18676 // Only commute if the entire NOT mask is a hidden shifted mask.
18677 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18678 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18679 if (XorC && ShiftC) {
18680 unsigned MaskIdx, MaskLen;
18681 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18682 unsigned ShiftAmt = ShiftC->getZExtValue();
18683 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18684 if (N->getOperand(0).getOpcode() == ISD::SHL)
18685 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18686 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18687 }
18688 }
18689
18690 return false;
18691}
18692
18693bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
18694 const SDNode *N) const {
18695 assert(((N->getOpcode() == ISD::SHL &&
18696 N->getOperand(0).getOpcode() == ISD::SRL) ||
18697 (N->getOpcode() == ISD::SRL &&
18698 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18699 "Expected shift-shift mask");
18700 // Don't allow multiuse shift folding with the same shift amount.
18701 if (!N->getOperand(0)->hasOneUse())
18702 return false;
18703
18704 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18705 EVT VT = N->getValueType(0);
18706 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18707 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18708 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18709 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18710 }
18711
18712 // We do not need to fold when this shift is used in the specific load case:
18713 // (ldr x, (add x, (shl (srl x, c1) 2)))
18714 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18715 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18716 unsigned ShlAmt = C2->getZExtValue();
18717 if (auto ShouldADD = *N->user_begin();
18718 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18719 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18720 EVT MemVT = Load->getMemoryVT();
18721
18722 if (Load->getValueType(0).isScalableVector())
18723 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
18724
18725 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
18726 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
18727 }
18728 }
18729 }
18730 }
18731
18732 return true;
18733}
18734
18735 bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
18736 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
18737 SDValue Y) const {
18738 return VT.isScalableVector() && isTypeLegal(VT) &&
18739 SelectOpcode == ISD::VSELECT;
18740}
18741
18742 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18743 Type *Ty) const {
18744 assert(Ty->isIntegerTy());
18745
18746 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18747 if (BitSize == 0)
18748 return false;
18749
18750 int64_t Val = Imm.getSExtValue();
18751 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18752 return true;
18753
18754 if (Val < 0)
18755 Val = ~Val;
18756 if (BitSize == 32)
18757 Val &= (1LL << 32) - 1;
18758
18759 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18760 // MOVZ is free so return true for one or fewer MOVK.
18761 return Shift < 3;
18762}
18763
18764 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18765 unsigned Index) const {
18766 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
18767 return false;
18768
18769 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18770}
18771
18772/// Turn vector tests of the signbit in the form of:
18773/// xor (sra X, elt_size(X)-1), -1
18774/// into:
18775/// cmge X, X, #0
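/// Worked example (editorial, illustrative): for a v4i32 X, each lane of
/// xor(sra(X, 31), -1) is NOT(sign-smeared X), i.e. all-ones exactly when the
/// lane is non-negative; that is the same lane mask as the setcc(X, 0, SETGE)
/// built below, which selects CMGE X, X, #0.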
18776 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18777 const AArch64Subtarget *Subtarget) {
18778 EVT VT = N->getValueType(0);
18779 if (!Subtarget->hasNEON() || !VT.isVector())
18780 return SDValue();
18781
18782 // There must be a shift right algebraic before the xor, and the xor must be a
18783 // 'not' operation.
18784 SDValue Shift = N->getOperand(0);
18785 SDValue Ones = N->getOperand(1);
18786 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18787 !ISD::isBuildVectorAllOnes(Ones.getNode()))
18788 return SDValue();
18789
18790 // The shift should be smearing the sign bit across each vector element.
18791 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18792 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18793 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18794 return SDValue();
18795
18796 SDLoc DL(N);
18797 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
18798 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
18799}
18800
18801// Given a vecreduce_add node, detect the below pattern and convert it to the
18802 // node sequence with UABDL, [S|U]ABD and UADDLP.
18803//
18804// i32 vecreduce_add(
18805// v16i32 abs(
18806// v16i32 sub(
18807// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18808//
18809// or
18810//
18811// i32 vecreduce_add(
18812// v16i32 zext(
18813// v16i16 abs(
18814// v16i16 sub(
18815// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
18816//
18817// =================>
18818// i32 vecreduce_add(
18819// v4i32 UADDLP(
18820// v8i16 add(
18821// v8i16 zext(
18822// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18823// v8i16 zext(
18824// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
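//
// Editorial note: the rewrite is lossless because each [S|U]ABD lane is at
// most 255, so the v8i16 add of the two zero-extended ABD halves is at most
// 510 per lane and cannot overflow before the UADDLP/VECREDUCE_ADD steps.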
18825 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18826 SelectionDAG &DAG) {
18827 // Assumed i32 vecreduce_add
18828 if (N->getValueType(0) != MVT::i32)
18829 return SDValue();
18830
18831 SDValue VecReduceOp0 = N->getOperand(0);
18832 bool SawTrailingZext = false;
18833 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
18834 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
18835 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
18836 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
18837 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
18838 SawTrailingZext = true;
18839 VecReduceOp0 = VecReduceOp0.getOperand(0);
18840 }
18841
18842 // The ABS input is v16i16 if we looked through such a zext, else v16i32.
18843 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
18844 // Assumed v16i16 or v16i32 abs input
18845 unsigned Opcode = VecReduceOp0.getOpcode();
18846 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
18847 return SDValue();
18848
18849 SDValue ABS = VecReduceOp0;
18850 // Assumed v16i16 or v16i32 sub
18851 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18852 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
18853 return SDValue();
18854
18855 SDValue SUB = ABS->getOperand(0);
18856 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18857 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18858 // Assumed v16i16 or v16i32 type
18859 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
18860 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
18861 return SDValue();
18862
18863 // Assumed zext or sext
18864 bool IsZExt = false;
18865 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18866 IsZExt = true;
18867 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18868 IsZExt = false;
18869 } else
18870 return SDValue();
18871
18872 SDValue EXT0 = SUB->getOperand(0);
18873 SDValue EXT1 = SUB->getOperand(1);
18874 // Assumed zext's operand has v16i8 type
18875 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18876 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18877 return SDValue();
18878
18879 // Pattern is detected. Let's convert it to sequence of nodes.
18880 SDLoc DL(N);
18881
18882 // First, create the node pattern of UABD/SABD.
18883 SDValue UABDHigh8Op0 =
18884 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18885 DAG.getConstant(8, DL, MVT::i64));
18886 SDValue UABDHigh8Op1 =
18887 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18888 DAG.getConstant(8, DL, MVT::i64));
18889 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18890 UABDHigh8Op0, UABDHigh8Op1);
18891 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18892
18893 // Second, create the node pattern of UABAL.
18894 SDValue UABDLo8Op0 =
18895 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18896 DAG.getConstant(0, DL, MVT::i64));
18897 SDValue UABDLo8Op1 =
18898 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18899 DAG.getConstant(0, DL, MVT::i64));
18900 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18901 UABDLo8Op0, UABDLo8Op1);
18902 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18903 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18904
18905 // Third, create the node of UADDLP.
18906 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18907
18908 // Fourth, create the node of VECREDUCE_ADD.
18909 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18910}
18911
18912static SDValue
18913 performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18914 const AArch64Subtarget *ST) {
18915 if (DCI.isBeforeLegalize())
18916 return SDValue();
18917
18918 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
18919 /*IsEqual=*/false))
18920 return While;
18921
18922 if (!N->getValueType(0).isScalableVector() ||
18923 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18924 return SDValue();
18925
18926 // Count the number of users which are EXTRACT_SUBVECTOR nodes.
18927 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
18928 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
18929 });
18930
18931 auto MaskEC = N->getValueType(0).getVectorElementCount();
18932 if (!MaskEC.isKnownMultipleOf(NumExts))
18933 return SDValue();
18934
18935 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
18936 if (ExtMinEC.getKnownMinValue() < 2)
18937 return SDValue();
18938
18939 SmallVector<SDNode *> Extracts(NumExts, nullptr);
18940 for (SDNode *Use : N->users()) {
18941 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18942 continue;
18943
18944 // Ensure the extract type is correct (e.g. if NumExts is 4 and
18945 // the mask return type is nxv8i1, each extract should be nxv2i1).
18946 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
18947 return SDValue();
18948
18949 // There should be exactly one extract for each part of the mask.
18950 unsigned Offset = Use->getConstantOperandVal(1);
18951 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
18952 if (Extracts[Part] != nullptr)
18953 return SDValue();
18954
18955 Extracts[Part] = Use;
18956 }
18957
18958 SelectionDAG &DAG = DCI.DAG;
18959 SDLoc DL(N);
18960 SDValue ID =
18961 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
18962
18963 SDValue Idx = N->getOperand(0);
18964 SDValue TC = N->getOperand(1);
18965 if (Idx.getValueType() != MVT::i64) {
18966 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
18967 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
18968 }
18969
18970 // Create the whilelo_x2 intrinsics from each pair of extracts
18971 EVT ExtVT = Extracts[0]->getValueType(0);
18972 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
18973 auto R =
18974 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18975 DCI.CombineTo(Extracts[0], R.getValue(0));
18976 DCI.CombineTo(Extracts[1], R.getValue(1));
18977 SmallVector<SDValue> Concats = {DAG.getNode(
18978 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
18979
18980 if (NumExts == 2) {
18981 assert(N->getValueType(0) == DoubleExtVT);
18982 return Concats[0];
18983 }
18984
18985 auto Elts =
18986 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
18987 for (unsigned I = 2; I < NumExts; I += 2) {
18988 // After the first whilelo_x2, we need to increment the starting value.
18989 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
18990 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18991 DCI.CombineTo(Extracts[I], R.getValue(0));
18992 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
18993 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
18994 R.getValue(0), R.getValue(1)));
18995 }
18996
18997 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
18998}
18999
19000// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
19001// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
19002// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
19003// If we have vectors larger than v16i8 we extract v16i8 vectors,
19004 // follow the same steps above to get DOT instructions, concatenate them,
19005// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
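// Illustrative example (editorial): for i32 vecreduce.add(zext(v32i8 A)),
// this emits two v4i32 UDOTs, one per v16i8 half of A with B a splat of 1,
// concatenates them into v8i32 and reduces that with a single vecreduce.add.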
19006 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
19007 const AArch64Subtarget *ST) {
19008 if (!ST->isNeonAvailable())
19009 return SDValue();
19010
19011 if (!ST->hasDotProd())
19012 return performVecReduceAddCombineWithUADDLP(N, DAG);
19013
19014 SDValue Op0 = N->getOperand(0);
19015 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
19016 Op0.getValueType().getVectorElementType() != MVT::i32)
19017 return SDValue();
19018
19019 unsigned ExtOpcode = Op0.getOpcode();
19020 SDValue A = Op0;
19021 SDValue B;
19022 unsigned DotOpcode;
19023 if (ExtOpcode == ISD::MUL) {
19024 A = Op0.getOperand(0);
19025 B = Op0.getOperand(1);
19026 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
19027 return SDValue();
19028 auto OpCodeA = A.getOpcode();
19029 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
19030 return SDValue();
19031
19032 auto OpCodeB = B.getOpcode();
19033 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
19034 return SDValue();
19035
19036 if (OpCodeA == OpCodeB) {
19037 DotOpcode =
19038 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
19039 } else {
19040 // Check USDOT support
19041 if (!ST->hasMatMulInt8())
19042 return SDValue();
19043 DotOpcode = AArch64ISD::USDOT;
19044 if (OpCodeA == ISD::SIGN_EXTEND)
19045 std::swap(A, B);
19046 }
19047 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
19048 DotOpcode = AArch64ISD::UDOT;
19049 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
19050 DotOpcode = AArch64ISD::SDOT;
19051 } else {
19052 return SDValue();
19053 }
19054
19055 EVT Op0VT = A.getOperand(0).getValueType();
19056 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19057 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19058 if (!IsValidElementCount || !IsValidSize)
19059 return SDValue();
19060
19061 SDLoc DL(Op0);
19062 // For non-mla reductions B can be set to 1. For MLA we take the operand of
19063 // the extend B.
19064 if (!B)
19065 B = DAG.getConstant(1, DL, Op0VT);
19066 else
19067 B = B.getOperand(0);
19068
19069 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19070 unsigned NumOfVecReduce;
19071 EVT TargetType;
19072 if (IsMultipleOf16) {
19073 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19074 TargetType = MVT::v4i32;
19075 } else {
19076 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19077 TargetType = MVT::v2i32;
19078 }
19079 // Handle the case where we need to generate only one Dot operation.
19080 if (NumOfVecReduce == 1) {
19081 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19082 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19083 A.getOperand(0), B);
19084 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19085 }
19086 // Generate Dot instructions that are multiple of 16.
19087 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19088 SmallVector<SDValue, 4> SDotVec16;
19089 unsigned I = 0;
19090 for (; I < VecReduce16Num; I += 1) {
19091 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19092 SDValue Op0 =
19093 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19094 DAG.getConstant(I * 16, DL, MVT::i64));
19095 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19096 DAG.getConstant(I * 16, DL, MVT::i64));
19097 SDValue Dot =
19098 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19099 SDotVec16.push_back(Dot);
19100 }
19101 // Concatenate dot operations.
19102 EVT SDot16EVT =
19103 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19104 SDValue ConcatSDot16 =
19105 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19106 SDValue VecReduceAdd16 =
19107 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19108 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19109 if (VecReduce8Num == 0)
19110 return VecReduceAdd16;
19111
19112 // Generate the remainder Dot operation that is multiple of 8.
19113 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19114 SDValue Vec8Op0 =
19115 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19116 DAG.getConstant(I * 16, DL, MVT::i64));
19117 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19118 DAG.getConstant(I * 16, DL, MVT::i64));
19119 SDValue Dot =
19120 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19121 SDValue VecReduceAdd8 =
19122 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19123 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19124 VecReduceAdd8);
19125}
19126
19127// Given an (integer) vecreduce, we know the order of the inputs does not
19128// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
19129// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
19130// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
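// Worked example (editorial): with x : v16i8 and VT = v8i16,
// add(zext(lo(x)), zext(hi(x))) has lanes x[i] + x[i+8] while UADDLP(x) has
// lanes x[2i] + x[2i+1]; the vectors differ, but both sum to "all bytes of
// x", which is the only thing the enclosing UADDV consumes.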
19131 static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
19132 auto DetectAddExtract = [&](SDValue A) {
19133 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19134 // UADDLP(x) if found.
19135 assert(A.getOpcode() == ISD::ADD);
19136 EVT VT = A.getValueType();
19137 SDValue Op0 = A.getOperand(0);
19138 SDValue Op1 = A.getOperand(1);
19139 if (Op0.getOpcode() != Op1.getOpcode() ||
19140 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19141 Op0.getOpcode() != ISD::SIGN_EXTEND))
19142 return SDValue();
19143 SDValue Ext0 = Op0.getOperand(0);
19144 SDValue Ext1 = Op1.getOperand(0);
19145 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19146 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19147 Ext0.getOperand(0) != Ext1.getOperand(0) ||
19149 return SDValue();
19150 // Check that the type is twice the add types, and the extract are from
19151 // upper/lower parts of the same source.
19152 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
19153 VT.getVectorNumElements() * 2)
19154 return SDValue();
19155 if ((Ext0.getConstantOperandVal(1) != 0 ||
19156 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
19157 (Ext1.getConstantOperandVal(1) != 0 ||
19158 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
19159 return SDValue();
19160 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19161 : AArch64ISD::SADDLP;
19162 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19163 };
19164
19165 if (SDValue R = DetectAddExtract(A))
19166 return R;
19167
19168 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19169 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19170 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19171 A.getOperand(1));
19172 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19173 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19174 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19175 A.getOperand(0));
19176 return SDValue();
19177}
19178
19179// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19180// UADDLV(concat), where the concat represents the 64-bit zext sources.
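// Worked example (editorial): UADDV(add(zext(v8i8 a), zext(v8i8 b))) sums all
// sixteen bytes of a and b, which is exactly what UADDLV computes on
// concat(a, b) : v16i8; the v8i16 case below re-casts the v4i32 UADDLV result
// with an NVCAST.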
19181 static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
19182 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19183 // UADDLV(concat(zext, zext)) if found.
19184 assert(A.getOpcode() == ISD::ADD);
19185 EVT VT = A.getValueType();
19186 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19187 return SDValue();
19188 SDValue Op0 = A.getOperand(0);
19189 SDValue Op1 = A.getOperand(1);
19190 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19191 return SDValue();
19192 SDValue Ext0 = Op0.getOperand(0);
19193 SDValue Ext1 = Op1.getOperand(0);
19194 EVT ExtVT0 = Ext0.getValueType();
19195 EVT ExtVT1 = Ext1.getValueType();
19196 // Check zext VTs are the same and 64-bit length.
19197 if (ExtVT0 != ExtVT1 ||
19198 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19199 return SDValue();
19200 // Get VT for concat of zext sources.
19201 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19202 SDValue Concat =
19203 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19204
19205 switch (VT.getSimpleVT().SimpleTy) {
19206 case MVT::v2i64:
19207 case MVT::v4i32:
19208 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19209 case MVT::v8i16: {
19210 SDValue Uaddlv =
19211 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19212 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19213 }
19214 default:
19215 llvm_unreachable("Unhandled vector type");
19216 }
19217}
19218
19219 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19220 SDValue A = N->getOperand(0);
19221 if (A.getOpcode() == ISD::ADD) {
19222 if (SDValue R = performUADDVAddCombine(A, DAG))
19223 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19224 else if (SDValue R = performUADDVZextCombine(A, DAG))
19225 return R;
19226 }
19227
19228 // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19229 MVT OpVT = A.getSimpleValueType();
19230 assert(N->getSimpleValueType(0) == OpVT &&
19231 "The operand type should be consistent with the result type of UADDV");
19232 APInt Mask = APInt::getAllOnes(OpVT.getVectorNumElements());
19233 Mask.clearBit(0);
19234 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19235 if (KnownLeadingLanes.isZero())
19236 return A;
19237
19238 return SDValue();
19239}
19240
19241 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
19242 TargetLowering::DAGCombinerInfo &DCI,
19243 const AArch64Subtarget *Subtarget) {
19244 if (DCI.isBeforeLegalizeOps())
19245 return SDValue();
19246
19247 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19248}
19249
19250SDValue
19251AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19252 SelectionDAG &DAG,
19253 SmallVectorImpl<SDNode *> &Created) const {
19254 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19255 if (isIntDivCheap(N->getValueType(0), Attr))
19256 return SDValue(N, 0); // Lower SDIV as SDIV
19257
19258 EVT VT = N->getValueType(0);
19259
19260 // If SVE is available, we can generate
19261 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
19262 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
19263 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19264 return SDValue(N, 0);
19265
19266 // fold (sdiv X, pow2)
19267 if ((VT != MVT::i32 && VT != MVT::i64) ||
19268 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19269 return SDValue();
19270
19271 // If the divisor is 2 or -2, the default expansion is better. It will add
19272 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
19273 if (Divisor == 2 ||
19274 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19275 return SDValue();
19276
19277 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19278}
19279
19280SDValue
19281AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19282 SelectionDAG &DAG,
19283 SmallVectorImpl<SDNode *> &Created) const {
19284 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19285 if (isIntDivCheap(N->getValueType(0), Attr))
19286 return SDValue(N, 0); // Lower SREM as SREM
19287
19288 EVT VT = N->getValueType(0);
19289
19290 // For scalable and fixed types, mark them as cheap so we can handle it much
19291 // later. This allows us to handle larger than legal types.
19292 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19293 return SDValue(N, 0);
19294
19295 // fold (srem X, pow2)
19296 if ((VT != MVT::i32 && VT != MVT::i64) ||
19297 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19298 return SDValue();
19299
19300 unsigned Lg2 = Divisor.countr_zero();
19301 if (Lg2 == 0)
19302 return SDValue();
19303
19304 SDLoc DL(N);
19305 SDValue N0 = N->getOperand(0);
19306 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19307 SDValue Zero = DAG.getConstant(0, DL, VT);
19308 SDValue CCVal, CSNeg;
19309 if (Lg2 == 1) {
19310 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19311 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19312 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19313
19314 Created.push_back(Cmp.getNode());
19315 Created.push_back(And.getNode());
19316 } else {
19317 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19318 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19319
19320 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19321 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19322 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19323 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19324 Negs.getValue(1));
19325
19326 Created.push_back(Negs.getNode());
19327 Created.push_back(AndPos.getNode());
19328 Created.push_back(AndNeg.getNode());
19329 }
19330
19331 return CSNeg;
19332}
19333
19334static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
19335 switch(getIntrinsicID(S.getNode())) {
19336 default:
19337 break;
19338 case Intrinsic::aarch64_sve_cntb:
19339 return 8;
19340 case Intrinsic::aarch64_sve_cnth:
19341 return 16;
19342 case Intrinsic::aarch64_sve_cntw:
19343 return 32;
19344 case Intrinsic::aarch64_sve_cntd:
19345 return 64;
19346 }
19347 return {};
19348}
19349
19350/// Calculates what the pre-extend type is, based on the extension
19351/// operation node provided by \p Extend.
19352///
19353/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19354/// pre-extend type is pulled directly from the operand, while other extend
19355/// operations need a bit more inspection to get this information.
19356///
19357/// \param Extend The SDNode from the DAG that represents the extend operation
19358///
19359/// \returns The type representing the \p Extend source type, or \p MVT::Other
19360/// if no valid type can be determined
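///
/// For example (editorial, illustrative): a ZERO_EXTEND from i8 and an
/// (and X, 0xFF) both yield MVT::i8, while (AssertSext X, i16) yields
/// MVT::i16.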
19361 static EVT calculatePreExtendType(SDValue Extend) {
19362 switch (Extend.getOpcode()) {
19363 case ISD::SIGN_EXTEND:
19364 case ISD::ZERO_EXTEND:
19365 case ISD::ANY_EXTEND:
19366 return Extend.getOperand(0).getValueType();
19367 case ISD::AssertSext:
19368 case ISD::AssertZext:
19369 case ISD::SIGN_EXTEND_INREG: {
19370 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19371 if (!TypeNode)
19372 return MVT::Other;
19373 return TypeNode->getVT();
19374 }
19375 case ISD::AND: {
19376 ConstantSDNode *Constant =
19377 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
19378 if (!Constant)
19379 return MVT::Other;
19380
19381 uint32_t Mask = Constant->getZExtValue();
19382
19383 if (Mask == UCHAR_MAX)
19384 return MVT::i8;
19385 else if (Mask == USHRT_MAX)
19386 return MVT::i16;
19387 else if (Mask == UINT_MAX)
19388 return MVT::i32;
19389
19390 return MVT::Other;
19391 }
19392 default:
19393 return MVT::Other;
19394 }
19395}
19396
19397/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19398/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19399/// SExt/ZExt rather than the scalar SExt/ZExt
19400 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
19401 EVT VT = BV.getValueType();
19402 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19403 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
19404 return SDValue();
19405
19406 // Use the first item in the buildvector/shuffle to get the size of the
19407 // extend, and make sure it looks valid.
19408 SDValue Extend = BV->getOperand(0);
19409 unsigned ExtendOpcode = Extend.getOpcode();
19410 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19411 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19412 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19413 ExtendOpcode == ISD::AssertSext;
19414 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19415 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19416 return SDValue();
19417 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
19418 // ensure calculatePreExtendType will work without issue.
19419 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19420 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19421 return SDValue();
19422
19423 // Restrict valid pre-extend data type
19424 EVT PreExtendType = calculatePreExtendType(Extend);
19425 if (PreExtendType == MVT::Other ||
19426 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19427 return SDValue();
19428
19429 // Make sure all other operands are equally extended.
19430 bool SeenZExtOrSExt = !IsAnyExt;
19431 for (SDValue Op : drop_begin(BV->ops())) {
19432 if (Op.isUndef())
19433 continue;
19434
19435 if (calculatePreExtendType(Op) != PreExtendType)
19436 return SDValue();
19437
19438 unsigned Opc = Op.getOpcode();
19439 if (Opc == ISD::ANY_EXTEND)
19440 continue;
19441
19442 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19443 Opc == ISD::AssertSext;
19444
19445 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19446 return SDValue();
19447
19448 IsSExt = OpcIsSExt;
19449 SeenZExtOrSExt = true;
19450 }
19451
19452 SDValue NBV;
19453 SDLoc DL(BV);
19454 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19455 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
19456 EVT PreExtendLegalType =
19457 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19458 SmallVector<SDValue, 8> NewOps;
19459 for (SDValue Op : BV->ops())
19460 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19461 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19462 PreExtendLegalType));
19463 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19464 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19465 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
19466 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19467 BV.getOperand(1).isUndef()
19468 ? DAG.getUNDEF(PreExtendVT)
19469 : BV.getOperand(1).getOperand(0),
19470 cast<ShuffleVectorSDNode>(BV)->getMask());
19471 }
19472 unsigned ExtOpc = !SeenZExtOrSExt
19473 ? ISD::ANY_EXTEND
19474 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19475 return DAG.getNode(ExtOpc, DL, VT, NBV);
19476}
19477
19478/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19479/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19480 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
19481 // If the value type isn't a vector, none of the operands are going to be dups
19482 EVT VT = Mul->getValueType(0);
19483 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19484 return SDValue();
19485
19486 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19487 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19488
19489 // Neither operands have been changed, don't make any further changes
19490 if (!Op0 && !Op1)
19491 return SDValue();
19492
19493 SDLoc DL(Mul);
19494 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19495 Op1 ? Op1 : Mul->getOperand(1));
19496}
19497
19498// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
19499// folding a power-of-two factor of the constant into the RDSVL immediate and
19500// compensating with an extra shift.
19501//
19502// We rewrite:
19503// (mul (srl (rdsvl 1), w), x)
19504// to one of:
19505// (shl (rdsvl y), z) if z > 0
19506// (srl (rdsvl y), abs(z)) if z < 0
19507// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
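// Worked example (editorial): with w = 1 and x = 6, x = 3 * 2^(1 + 0), so
// y = 3 and z = 0, and (mul (srl (rdsvl 1), 1), 6) simplifies to (rdsvl 3)
// with no extra shift. With x = 3 instead, z = -1 and y = 3, giving
// (srl (rdsvl 3), 1).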
19508 static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG) {
19509 SDLoc DL(Mul);
19510 EVT VT = Mul->getValueType(0);
19511 SDValue MulOp0 = Mul->getOperand(0);
19512 int ConstMultiplier =
19513 cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
19514 if ((MulOp0->getOpcode() != ISD::SRL) ||
19515 (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
19516 return SDValue();
19517
19518 unsigned AbsConstValue = abs(ConstMultiplier);
19519 unsigned OperandShift =
19520 cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
19521
19522 // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
19523 // integral)
19524 int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
19525
19526 // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
19527 // 2^(w + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - w (LowerBound).
19528 unsigned B = ConstMultiplier < 0 ? 32 : 31;
19529 unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
19530 int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
19531
19532 // No valid solution found.
19533 if (LowerBound > UpperBound)
19534 return SDValue();
19535
19536 // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
19537 // shift if possible.
19538 int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
19539
19540 // y = x / 2^(w + z)
19541 int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
19542 (ConstMultiplier < 0 ? -1 : 1);
19543 auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
19544 DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
19545
19546 if (Shift == 0)
19547 return Rdsvl;
19548 return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
19549 DAG.getConstant(abs(Shift), DL, MVT::i32),
19551}
19552
19553// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
19554// Same for other types with equivalent constants.
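// Editorial note on why this works: viewing each i32 lane as two i16 halves,
// (srl X, 15) & 0x10001 isolates the sign bit of each half, and the multiply
// by 0xffff turns every set sign bit into an all-ones i16 half. That is the
// v8i16 compare 0 > X (CMLT X, #0), which is what the setcc below builds.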
19556 EVT VT = N->getValueType(0);
19557 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
19558 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
19559 return SDValue();
19560 if (N->getOperand(0).getOpcode() != ISD::AND ||
19561 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
19562 return SDValue();
19563
19564 SDValue And = N->getOperand(0);
19565 SDValue Srl = And.getOperand(0);
19566
19567 APInt V1, V2, V3;
19568 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
19569 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
19570 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
19571 return SDValue();
19572
19573 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
19574 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
19575 V3 != (HalfSize - 1))
19576 return SDValue();
19577
19578 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19579 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
19580 VT.getVectorElementCount() * 2);
19581
19582 SDLoc DL(N);
19583 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
19584 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
19585 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
19586 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
19587}
19588
19589// Transform vector add(zext i8 to i32, zext i8 to i32)
19590// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19591 // This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19592// extends.
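// Editorial note on soundness: values extended from i8 (or i16) cannot
// overflow the half-width type under a single add or mul (e.g. an unsigned i8
// sum is at most 510 and a product at most 65025), so extending the narrow
// result back up to VT reproduces the original wide result.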
19593 static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
19594 EVT VT = N->getValueType(0);
19595 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19596 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19597 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19598 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19599 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19600 N->getOperand(0).getOperand(0).getValueType() !=
19601 N->getOperand(1).getOperand(0).getValueType())
19602 return SDValue();
19603
19604 if (N->getOpcode() == ISD::MUL &&
19605 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
19606 return SDValue();
19607
19608 SDValue N0 = N->getOperand(0).getOperand(0);
19609 SDValue N1 = N->getOperand(1).getOperand(0);
19610 EVT InVT = N0.getValueType();
19611
19612 EVT S1 = InVT.getScalarType();
19613 EVT S2 = VT.getScalarType();
19614 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19615 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19616 SDLoc DL(N);
19617 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19620 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19621 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19622 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19623 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
19624 : (unsigned)ISD::SIGN_EXTEND,
19625 DL, VT, NewOp);
19626 }
19627 return SDValue();
19628}
19629
19630 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
19631 TargetLowering::DAGCombinerInfo &DCI,
19632 const AArch64Subtarget *Subtarget) {
19633
19634 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
19635 return Ext;
19637 return Ext;
19638 if (SDValue Ext = performVectorExtCombine(N, DAG))
19639 return Ext;
19640
19641 if (DCI.isBeforeLegalizeOps())
19642 return SDValue();
19643
19644 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
19645 // and in MachineCombiner pass, add+mul will be combined into madd.
19646 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
19647 SDLoc DL(N);
19648 EVT VT = N->getValueType(0);
19649 SDValue N0 = N->getOperand(0);
19650 SDValue N1 = N->getOperand(1);
19651 SDValue MulOper;
19652 unsigned AddSubOpc;
19653
19654 auto IsAddSubWith1 = [&](SDValue V) -> bool {
19655 AddSubOpc = V->getOpcode();
19656 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
19657 SDValue Opnd = V->getOperand(1);
19658 MulOper = V->getOperand(0);
19659 if (AddSubOpc == ISD::SUB)
19660 std::swap(Opnd, MulOper);
19661 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
19662 return C->isOne();
19663 }
19664 return false;
19665 };
19666
19667 if (IsAddSubWith1(N0)) {
19668 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
19669 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
19670 }
19671
19672 if (IsAddSubWith1(N1)) {
19673 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
19674 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
19675 }
19676
19677 // The below optimizations require a constant RHS.
19678 if (!isa<ConstantSDNode>(N1))
19679 return SDValue();
19680
19681 if (SDValue Ext = performMulRdsvlCombine(N, DAG))
19682 return Ext;
19683
19684 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
19685 const APInt &ConstValue = C->getAPIntValue();
19686
19687 // Allow the scaling to be folded into the `cnt` instruction by preventing
19688 // the scaling from being obscured here. This makes it easier to pattern match.
19689 if (IsSVECntIntrinsic(N0) ||
19690 (N0->getOpcode() == ISD::TRUNCATE &&
19691 (IsSVECntIntrinsic(N0->getOperand(0)))))
19692 if (ConstValue.sge(1) && ConstValue.sle(16))
19693 return SDValue();
19694
19695 // Multiplication of a power of two plus/minus one can be done more
19696 // cheaply as shift+add/sub. For now, this is true unilaterally. If
19697 // future CPUs have a cheaper MADD instruction, this may need to be
19698 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
19699 // 64-bit is 5 cycles, so this is always a win.
19700 // More aggressively, some multiplications N0 * C can be lowered to
19701 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
19702 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
19703 // TODO: lower more cases.
19704
19705 // TrailingZeroes is used to test if the mul can be lowered to
19706 // shift+add+shift.
19707 unsigned TrailingZeroes = ConstValue.countr_zero();
19708 if (TrailingZeroes) {
19709 // Conservatively do not lower to shift+add+shift if the mul might be
19710 // folded into smul or umul.
19711 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
19712 isZeroExtended(N0, DAG)))
19713 return SDValue();
19714 // Conservatively do not lower to shift+add+shift if the mul might be
19715 // folded into madd or msub.
19716 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
19717 N->user_begin()->getOpcode() == ISD::SUB))
19718 return SDValue();
19719 }
19720 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
19721 // and shift+add+shift.
19722 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
19723 unsigned ShiftAmt;
19724
19725 auto Shl = [&](SDValue N0, unsigned N1) {
19726 if (!N0.getNode())
19727 return SDValue();
19728 // If shift causes overflow, ignore this combine.
19729 if (N1 >= N0.getValueSizeInBits())
19730 return SDValue();
19731 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
19732 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
19733 };
19734 auto Add = [&](SDValue N0, SDValue N1) {
19735 if (!N0.getNode() || !N1.getNode())
19736 return SDValue();
19737 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19738 };
19739 auto Sub = [&](SDValue N0, SDValue N1) {
19740 if (!N0.getNode() || !N1.getNode())
19741 return SDValue();
19742 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19743 };
19744 auto Negate = [&](SDValue N) {
19745 if (!N0.getNode())
19746 return SDValue();
19747 SDValue Zero = DAG.getConstant(0, DL, VT);
19748 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19749 };
19750
19751 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
19752 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
19753 // the (2^N - 1) can't be executed via a single instruction.
19754 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19755 unsigned BitWidth = C.getBitWidth();
19756 for (unsigned i = 1; i < BitWidth / 2; i++) {
19757 APInt Rem;
19758 APInt X(BitWidth, (1 << i) + 1);
19759 APInt::sdivrem(C, X, N, Rem);
19760 APInt NVMinus1 = N - 1;
19761 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19762 M = X;
19763 return true;
19764 }
19765 }
19766 return false;
19767 };
19768
19769 // Can the const C be decomposed into (2^M + 1) * 2^N + 1), eg:
19770 // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
19771 // the (2^N - 1) can't be executed via a single instruction.
19772 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19773 APInt CVMinus1 = C - 1;
19774 if (CVMinus1.isNegative())
19775 return false;
19776 unsigned TrailingZeroes = CVMinus1.countr_zero();
19777 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19778 if (SCVMinus1.isPowerOf2()) {
19779 unsigned BitWidth = SCVMinus1.getBitWidth();
19780 M = APInt(BitWidth, SCVMinus1.logBase2());
19781 N = APInt(BitWidth, TrailingZeroes);
19782 return true;
19783 }
19784 return false;
19785 };
19786
19787 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19788 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19789 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19790 APInt CVMinus1 = C - 1;
19791 if (CVMinus1.isNegative())
19792 return false;
19793 unsigned TrailingZeroes = CVMinus1.countr_zero();
19794 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19795 if (CVPlus1.isPowerOf2()) {
19796 unsigned BitWidth = CVPlus1.getBitWidth();
19797 M = APInt(BitWidth, CVPlus1.logBase2());
19798 N = APInt(BitWidth, TrailingZeroes);
19799 return true;
19800 }
19801 return false;
19802 };
19803
19804 if (ConstValue.isNonNegative()) {
19805 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19806 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19807 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19808 // (mul x, (2^M + 1) * (2^N + 1))
19809 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19810 // (mul x, (2^M + 1) * 2^N + 1)
19811 // => MV = add (shl x, M), x); add (shl MV, N), x)
19812 // (mul x, 1 - (1 - 2^M) * 2^N)
19813 // => MV = sub (x - (shl x, M)); sub (x - (shl MV, N))
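// Worked example (editorial): C = 45 = (1 + 4) * (1 + 8) is handled by the
// isPowPlusPlusConst path below (guarded by hasALULSLFast):
// MV = (x << 2) + x and the result is (MV << 3) + MV, i.e. two ADD-with-LSL
// instructions instead of a MOV immediate plus MUL.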
19814 APInt SCVMinus1 = ShiftedConstValue - 1;
19815 APInt SCVPlus1 = ShiftedConstValue + 1;
19816 APInt CVPlus1 = ConstValue + 1;
19817 APInt CVM, CVN;
19818 if (SCVMinus1.isPowerOf2()) {
19819 ShiftAmt = SCVMinus1.logBase2();
19820 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19821 } else if (CVPlus1.isPowerOf2()) {
19822 ShiftAmt = CVPlus1.logBase2();
19823 return Sub(Shl(N0, ShiftAmt), N0);
19824 } else if (SCVPlus1.isPowerOf2()) {
19825 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19826 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19827 }
19828 if (Subtarget->hasALULSLFast() &&
19829 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19830 APInt CVMMinus1 = CVM - 1;
19831 APInt CVNMinus1 = CVN - 1;
19832 unsigned ShiftM1 = CVMMinus1.logBase2();
19833 unsigned ShiftN1 = CVNMinus1.logBase2();
19834 // ALULSLFast implies that shifts by <= 4 places are fast
19835 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19836 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19837 return Add(Shl(MVal, ShiftN1), MVal);
19838 }
19839 }
19840 if (Subtarget->hasALULSLFast() &&
19841 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19842 unsigned ShiftM = CVM.getZExtValue();
19843 unsigned ShiftN = CVN.getZExtValue();
19844 // ALULSLFast implies that shifts by <= 4 places are fast
19845 if (ShiftM <= 4 && ShiftN <= 4) {
19846 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
19847 return Add(Shl(MVal, CVN.getZExtValue()), N0);
19848 }
19849 }
19850
19851 if (Subtarget->hasALULSLFast() &&
19852 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19853 unsigned ShiftM = CVM.getZExtValue();
19854 unsigned ShiftN = CVN.getZExtValue();
19855 // ALULSLFast implies that shifts by <= 4 places are fast
19856 if (ShiftM <= 4 && ShiftN <= 4) {
19857 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
19858 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
19859 }
19860 }
19861 } else {
19862 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19863 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19864 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
19865 APInt SCVPlus1 = -ShiftedConstValue + 1;
19866 APInt CVNegPlus1 = -ConstValue + 1;
19867 APInt CVNegMinus1 = -ConstValue - 1;
19868 if (CVNegPlus1.isPowerOf2()) {
19869 ShiftAmt = CVNegPlus1.logBase2();
19870 return Sub(N0, Shl(N0, ShiftAmt));
19871 } else if (CVNegMinus1.isPowerOf2()) {
19872 ShiftAmt = CVNegMinus1.logBase2();
19873 return Negate(Add(Shl(N0, ShiftAmt), N0));
19874 } else if (SCVPlus1.isPowerOf2()) {
19875 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19876 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19877 }
19878 }
19879
19880 return SDValue();
19881}
19882
19883 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
19884 SelectionDAG &DAG) {
19885 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19886 // optimize away operation when it's from a constant.
19887 //
19888 // The general transformation is:
19889 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19890 // AND(VECTOR_CMP(x,y), constant2)
19891 // constant2 = UNARYOP(constant)
19892
19893 // Early exit if this isn't a vector operation, the operand of the
19894 // unary operation isn't a bitwise AND, or if the sizes of the operations
19895 // aren't the same.
19896 EVT VT = N->getValueType(0);
19897 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19898 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19899 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19900 return SDValue();
19901
19902 // Now check that the other operand of the AND is a constant. We could
19903 // make the transformation for non-constant splats as well, but it's unclear
19904 // that would be a benefit as it would not eliminate any operations, just
19905 // perform one more step in scalar code before moving to the vector unit.
19906 if (BuildVectorSDNode *BV =
19907 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19908 // Bail out if the vector isn't a constant.
19909 if (!BV->isConstant())
19910 return SDValue();
19911
19912 // Everything checks out. Build up the new and improved node.
19913 SDLoc DL(N);
19914 EVT IntVT = BV->getValueType(0);
19915 // Create a new constant of the appropriate type for the transformed
19916 // DAG.
19917 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19918 // The AND node needs bitcasts to/from an integer vector type around it.
19919 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19920 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19921 N->getOperand(0)->getOperand(0), MaskConst);
19922 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19923 return Res;
19924 }
19925
19926 return SDValue();
19927}
19928
19929/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19930/// functions, this can help to reduce the number of fmovs to/from GPRs.
19931static SDValue
19932 tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19933 TargetLowering::DAGCombinerInfo &DCI,
19934 const AArch64Subtarget *Subtarget) {
19935 if (N->isStrictFPOpcode())
19936 return SDValue();
19937
19938 if (DCI.isBeforeLegalizeOps())
19939 return SDValue();
19940
19941 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19942 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19943 return SDValue();
19944
19945 auto isSupportedType = [](EVT VT) {
19946 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19947 };
19948
19949 SDValue SrcVal = N->getOperand(0);
19950 EVT SrcTy = SrcVal.getValueType();
19951 EVT DestTy = N->getValueType(0);
19952
19953 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19954 return SDValue();
19955
19956 EVT SrcVecTy;
19957 EVT DestVecTy;
19958 if (DestTy.bitsGT(SrcTy)) {
19959 DestVecTy = getPackedSVEVectorVT(DestTy);
19960 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19961 } else {
19962 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19963 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19964 }
19965
19966 // Ensure the resulting src/dest vector type is legal.
19967 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19968 return SDValue();
19969
19970 SDLoc DL(N);
19971 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19972 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19973 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19974 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19975 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19976}
19977
19978 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19979 TargetLowering::DAGCombinerInfo &DCI,
19980 const AArch64Subtarget *Subtarget) {
19981 // First try to optimize away the conversion when it's conditionally from
19982 // a constant. Vectors only.
19983 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
19984 return Res;
19985
19986 if (SDValue Res =
19987 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19988 return Res;
19989
19990 EVT VT = N->getValueType(0);
19991 if (VT != MVT::f32 && VT != MVT::f64)
19992 return SDValue();
19993
19994 // Only optimize when the source and destination types have the same width.
19995 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19996 return SDValue();
19997
19998 // If the result of an integer load is only used by an integer-to-float
19999 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
20000 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
20001 SDValue N0 = N->getOperand(0);
20002 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
20003 N0.hasOneUse() &&
20004 // Do not change the width of a volatile load.
20005 !cast<LoadSDNode>(N0)->isVolatile()) {
20006 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
20007 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
20008 LN0->getPointerInfo(), LN0->getAlign(),
20009 LN0->getMemOperand()->getFlags());
20010
20011 // Make sure successors of the original load stay after it by updating them
20012 // to use the new Chain.
20013 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
20014
20015 unsigned Opcode =
20016 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
20017 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
20018 }
20019
20020 return SDValue();
20021}
20022
20023/// Fold a floating-point multiply by power of two into floating-point to
20024/// fixed-point conversion.
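///
/// For example (editorial, illustrative): (fp_to_sint (fmul v4f32 X, 8.0))
/// becomes a vcvtfp2fxs conversion with 3 fractional bits (an fcvtzs with
/// #3), since multiplying by 8.0 == 2^3 before truncation is the same as
/// reading the result as a fixed-point value with 3 fractional bits.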
20025 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
20026 TargetLowering::DAGCombinerInfo &DCI,
20027 const AArch64Subtarget *Subtarget) {
20028 if (SDValue Res =
20029 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20030 return Res;
20031
20032 if (!Subtarget->isNeonAvailable())
20033 return SDValue();
20034
20035 if (!N->getValueType(0).isSimple())
20036 return SDValue();
20037
20038 SDValue Op = N->getOperand(0);
20039 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
20040 return SDValue();
20041
20042 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
20043 return SDValue();
20044
20045 SDValue ConstVec = Op->getOperand(1);
20046 if (!isa<BuildVectorSDNode>(ConstVec))
20047 return SDValue();
20048
20049 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
20050 uint32_t FloatBits = FloatTy.getSizeInBits();
20051 if (FloatBits != 32 && FloatBits != 64 &&
20052 (FloatBits != 16 || !Subtarget->hasFullFP16()))
20053 return SDValue();
20054
20055 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
20056 uint32_t IntBits = IntTy.getSizeInBits();
20057 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
20058 return SDValue();
20059
20060 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
20061 if (IntBits > FloatBits)
20062 return SDValue();
20063
20064 BitVector UndefElements;
20065 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
20066 int32_t Bits = IntBits == 64 ? 64 : 32;
20067 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
20068 if (C == -1 || C == 0 || C > Bits)
20069 return SDValue();
20070
20071 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
20072 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
20073 return SDValue();
20074
20075 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
20076 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
20077 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
20078 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
20079 return SDValue();
20080 }
20081
20082 SDLoc DL(N);
20083 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
20084 N->getOpcode() == ISD::FP_TO_SINT_SAT);
20085 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
20086 : Intrinsic::aarch64_neon_vcvtfp2fxu;
20087 SDValue FixConv =
20089 DAG.getTargetConstant(IntrinsicOpcode, DL, MVT::i32),
20090 Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32));
20091 // We can handle smaller integers by generating an extra trunc.
20092 if (IntBits < FloatBits)
20093 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
20094
20095 return FixConv;
20096}
20097
20098// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
20099// convert to csel(ccmp(.., cc0)), depending on cc1:
20100
20101// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20102// =>
20103// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
20104//
20105// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20106// =>
20107// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
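//
// Editorial note: the NZCV immediate on the CCMP is what preserves the
// short-circuit semantics: when the first comparison already decides the
// and/or on its own, the injected flags make the final CSET produce that
// same result.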
20108 static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
20109 EVT VT = N->getValueType(0);
20110 SDValue CSel0 = N->getOperand(0);
20111 SDValue CSel1 = N->getOperand(1);
20112
20113 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
20114 CSel1.getOpcode() != AArch64ISD::CSEL)
20115 return SDValue();
20116
20117 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
20118 return SDValue();
20119
20120 if (!isNullConstant(CSel0.getOperand(0)) ||
20121 !isOneConstant(CSel0.getOperand(1)) ||
20122 !isNullConstant(CSel1.getOperand(0)) ||
20123 !isOneConstant(CSel1.getOperand(1)))
20124 return SDValue();
20125
20126 SDValue Cmp0 = CSel0.getOperand(3);
20127 SDValue Cmp1 = CSel1.getOperand(3);
20130 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
20131 return SDValue();
20132 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
20133 Cmp0.getOpcode() == AArch64ISD::SUBS) {
20134 std::swap(Cmp0, Cmp1);
20135 std::swap(CC0, CC1);
20136 }
20137
20138 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
20139 return SDValue();
20140
20141 SDLoc DL(N);
20142 SDValue CCmp, Condition;
20143 unsigned NZCV;
20144
20145 if (N->getOpcode() == ISD::AND) {
20147 Condition = getCondCode(DAG, InvCC0);
20149 } else {
20151 Condition = getCondCode(DAG, CC0);
20153 }
20154
20155 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
20156
20157 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
20158 if (Op1 && Op1->getAPIntValue().isNegative() &&
20159 Op1->getAPIntValue().sgt(-32)) {
20160 // CCMP accepts constants in the range [0, 31].
20161 // If Op1 is a constant in the range [-31, -1], we
20162 // can select CCMN to avoid the extra mov.
20163 SDValue AbsOp1 =
20164 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
20165 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
20166 AbsOp1, NZCVOp, Condition, Cmp0);
20167 } else {
20168 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
20169 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
20170 }
20171 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
20172 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
20173}
20174
20176 const AArch64Subtarget *Subtarget,
20177 const AArch64TargetLowering &TLI) {
20178 SelectionDAG &DAG = DCI.DAG;
20179
20180 if (SDValue R = performANDORCSELCombine(N, DAG))
20181 return R;
20182
20183 return SDValue();
20184}
20185
20187 if (!MemVT.getVectorElementType().isSimple())
20188 return false;
20189
20190 uint64_t MaskForTy = 0ull;
20191 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
20192 case MVT::i8:
20193 MaskForTy = 0xffull;
20194 break;
20195 case MVT::i16:
20196 MaskForTy = 0xffffull;
20197 break;
20198 case MVT::i32:
20199 MaskForTy = 0xffffffffull;
20200 break;
20201 default:
20202 return false;
20203 break;
20204 }
20205
20206 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
20207 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
20208 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
20209
20210 return false;
20211}
20212
20214 SDValue LeafOp = SDValue(N, 0);
20215 SDValue Op = N->getOperand(0);
20216 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
20217 LeafOp.getValueType() != Op.getValueType())
20218 Op = Op->getOperand(0);
20219 if (LeafOp.getValueType() == Op.getValueType())
20220 return Op;
20221 return SDValue();
20222}
20223
20226 SelectionDAG &DAG = DCI.DAG;
20227 SDValue Src = N->getOperand(0);
20228 unsigned Opc = Src->getOpcode();
20229
20230 // Zero/any extend of an unsigned unpack
20231 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20232 SDValue UnpkOp = Src->getOperand(0);
20233 SDValue Dup = N->getOperand(1);
20234
20235 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
20236 return SDValue();
20237
20238 SDLoc DL(N);
20239 auto *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
20240 if (!C)
20241 return SDValue();
20242
20243 uint64_t ExtVal = C->getZExtValue();
20244
20245 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
20246 return ((ExtVal == 0xFF && VT == MVT::i8) ||
20247 (ExtVal == 0xFFFF && VT == MVT::i16) ||
20248 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
20249 };
20250
20251 // If the mask is fully covered by the unpack, we don't need to push
20252 // a new AND onto the operand
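// For example, (and (uunpklo (nxv16i8 X)), splat(0xff)) is redundant: the
// unpack already zero-extends each i8 lane into an i16 lane, so the high bits
// the mask would clear are known to be zero.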
20253 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
20254 if (MaskAndTypeMatch(EltTy))
20255 return Src;
20256
20257 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
20258 // to see if the mask is all-ones of size MemTy.
20259 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
20260 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
20261 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
20262 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
20263 if (MaskAndTypeMatch(EltTy))
20264 return Src;
20265 }
20266
20267 // Truncate to prevent a DUP with an over-wide constant
20268 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
20269
20270 // Otherwise, make sure we propagate the AND to the operand
20271 // of the unpack
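// For example, (and (uunpklo X), splat(0xf)) becomes
// (uunpklo (and X, splat(0xf))), where the narrower AND can often be combined
// further.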
20272 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
20273 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
20274
20275 SDValue And = DAG.getNode(ISD::AND, DL,
20276 UnpkOp->getValueType(0), UnpkOp, Dup);
20277
20278 return DAG.getNode(Opc, DL, N->getValueType(0), And);
20279 }
20280
20281 if (DCI.isBeforeLegalizeOps())
20282 return SDValue();
20283
20284 // If either operand of the AND is an all-active i1 predicate, the AND is a
20285 // no-op and we can simply return the other operand.
20286 if (isAllActivePredicate(DAG, N->getOperand(0)))
20287 return N->getOperand(1);
20288 if (isAllActivePredicate(DAG, N->getOperand(1)))
20289 return N->getOperand(0);
20290
20291 if (!EnableCombineMGatherIntrinsics)
20292 return SDValue();
20293
20294 SDValue Mask = N->getOperand(1);
20295
20296 if (!Src.hasOneUse())
20297 return SDValue();
20298
20299 EVT MemVT;
20300
20301 // SVE load instructions perform an implicit zero-extend, which makes them
20302 // perfect candidates for combining.
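// For example, an LD1B into .h lanes already zero-extends each element from
// i8, so an AND of its result with splat(0xff) is a no-op.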
20303 switch (Opc) {
20304 case AArch64ISD::LD1_MERGE_ZERO:
20305 case AArch64ISD::LDNF1_MERGE_ZERO:
20306 case AArch64ISD::LDFF1_MERGE_ZERO:
20307 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
20308 break;
20309 case AArch64ISD::GLD1_MERGE_ZERO:
20310 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20311 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20312 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20313 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20314 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20315 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20316 case AArch64ISD::GLDFF1_MERGE_ZERO:
20317 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20318 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20319 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20320 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20321 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20322 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20323 case AArch64ISD::GLDNT1_MERGE_ZERO:
20324 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
20325 break;
20326 default:
20327 return SDValue();
20328 }
20329
20330 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
20331 return Src;
20332
20333 return SDValue();
20334}
20335
20336 // Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
20337 static SDValue performANDSETCCCombine(SDNode *N,
20338 TargetLowering::DAGCombinerInfo &DCI) {
20339
20340 // This function optimizes a specific pattern: an AND operation whose first
20341 // operand is a SETCC (Set Condition Code) node on floating-point values.
20342
20343 SDValue SetCC = N->getOperand(0);
20344 EVT VT = N->getValueType(0);
20345 SelectionDAG &DAG = DCI.DAG;
20346
20347 // If the current node (N) is used by any SELECT, return an empty SDValue
20348 // and skip the optimization, since applying it there would produce
20349 // incorrect results.
20350 for (auto U : N->users())
20351 if (U->getOpcode() == ISD::SELECT)
20352 return SDValue();
20353
20354 // Check if the operand is a SETCC node with floating-point comparison
20355 if (SetCC.getOpcode() == ISD::SETCC &&
20356 SetCC.getOperand(0).getValueType() == MVT::f32) {
20357
20358 SDValue Cmp;
20359 AArch64CC::CondCode CC;
20360
20361 // Check if the DAG is after legalization and if we can emit the conjunction
20362 if (!DCI.isBeforeLegalize() &&
20363 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
20364
20365 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
20366
20367 SDLoc DL(N);
20368 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
20369 DAG.getConstant(0, DL, VT),
20370 getCondCode(DAG, InvertedCC), Cmp);
20371 }
20372 }
20373 return SDValue();
20374}
20375
20376 static SDValue performANDCombine(SDNode *N,
20377 TargetLowering::DAGCombinerInfo &DCI) {
20378 SelectionDAG &DAG = DCI.DAG;
20379 SDValue LHS = N->getOperand(0);
20380 SDValue RHS = N->getOperand(1);
20381 EVT VT = N->getValueType(0);
20382
20383 if (SDValue R = performANDORCSELCombine(N, DAG))
20384 return R;
20385
20386 if (SDValue R = performANDSETCCCombine(N, DCI))
20387 return R;
20388
20389 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
20390 return SDValue();
20391
20392 if (VT.isScalableVector())
20393 return performSVEAndCombine(N, DCI);
20394
20395 // The combining code below works only for NEON vectors. In particular, it
20396 // does not work for SVE when dealing with vectors wider than 128 bits.
20397 if (!VT.is64BitVector() && !VT.is128BitVector())
20398 return SDValue();
20399
20400 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
20401 if (!BVN)
20402 return SDValue();
20403
20404 // AND does not accept an immediate, so check if we can use a BIC immediate
20405 // instruction instead. We do this here instead of using a (and x, (mvni imm))
20406 // pattern in isel, because some immediates may be lowered to the preferred
20407 // (and x, (movi imm)) form, even though an mvni representation also exists.
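// For example, a v4i32 AND with splat(0xffffff00) has no MOVI encoding, but
// its complement 0xff does, so it can be selected as (BIC v, #0xff).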
20408 APInt DefBits(VT.getSizeInBits(), 0);
20409 APInt UndefBits(VT.getSizeInBits(), 0);
20410 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
20411 SDValue NewOp;
20412
20413 // Any bits known to already be 0 need not be cleared again, which can help
20414 // reduce the size of the immediate to one supported by the instruction.
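// For example, if the upper 16 bits of every lane are already known to be
// zero, (and x, 0x0000ff00) only has to clear bits 0-7 and can therefore be
// selected as (BIC x, #0xff).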
20415 KnownBits Known = DAG.computeKnownBits(LHS);
20416 APInt ZeroSplat(VT.getSizeInBits(), 0);
20417 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
20418 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
20419 << (Known.Zero.getBitWidth() * I);
20420
20421 DefBits = ~(DefBits | ZeroSplat);
20422 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20423 DefBits, &LHS)) ||
20424 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20425 DefBits, &LHS)))
20426 return NewOp;
20427
20428 UndefBits = ~(UndefBits | ZeroSplat);
20429 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20430 UndefBits, &LHS)) ||
20431 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20432 UndefBits, &LHS)))
20433 return NewOp;
20434 }
20435
20436 return SDValue();
20437}
20438
20439 static SDValue performFADDCombine(SDNode *N,
20440 TargetLowering::DAGCombinerInfo &DCI) {
20441 SelectionDAG &DAG = DCI.DAG;
20442 SDValue LHS = N->getOperand(0);
20443 SDValue RHS = N->getOperand(1);
20444 EVT VT = N->getValueType(0);
20445 SDLoc DL(N);
20446
20447 if (!N->getFlags().hasAllowReassociation())
20448 return SDValue();
20449
20450 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
20451 auto ReassocComplex = [&](SDValue A, SDValue B) {
20452 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
20453 return SDValue();
20454 unsigned Opc = A.getConstantOperandVal(0);
20455 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
20456 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
20457 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
20458 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
20459 return SDValue();
20460 SDValue VCMLA = DAG.getNode(
20461 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
20462 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
20463 A.getOperand(2), A.getOperand(3));
20464 VCMLA->setFlags(A->getFlags());
20465 return VCMLA;
20466 };
20467 if (SDValue R = ReassocComplex(LHS, RHS))
20468 return R;
20469 if (SDValue R = ReassocComplex(RHS, LHS))
20470 return R;
20471
20472 return SDValue();
20473}
20474
20475 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
20476 switch (Opcode) {
20477 case ISD::STRICT_FADD:
20478 case ISD::FADD:
20479 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
20480 case ISD::ADD:
20481 return VT == MVT::i64;
20482 default:
20483 return false;
20484 }
20485}
20486
20487static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20488 AArch64CC::CondCode Cond);
20489
20490 static bool isPredicateCCSettingOp(SDValue N) {
20491 if ((N.getOpcode() == ISD::SETCC) ||
20492 // get_active_lane_mask is lowered to a whilelo instruction.
20493 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20494 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20495 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20496 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
20497 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20498 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
20499 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20500 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
20501 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20502 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
20503 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20504 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
20505 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20506 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
20507 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20508 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
20509 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
20510 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
20511 return true;
20512
20513 return false;
20514}
20515
20516 // Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20517 // ... into: "ptrue p, all" + PTEST
20518 static SDValue
20519 performFirstTrueTestVectorCombine(SDNode *N,
20520 TargetLowering::DAGCombinerInfo &DCI,
20521 const AArch64Subtarget *Subtarget) {
20522 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20523 // Make sure PTEST can be legalised with illegal types.
20524 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20525 return SDValue();
20526
20527 SDValue N0 = N->getOperand(0);
20528 EVT VT = N0.getValueType();
20529
20530 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20531 !isNullConstant(N->getOperand(1)))
20532 return SDValue();
20533
20534 // Restrict the DAG combine to only cases where we're extracting from a
20535 // flag-setting operation.
20536 if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
20537 return SDValue();
20538
20539 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
20540 SelectionDAG &DAG = DCI.DAG;
20541 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
20542 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
20543}
20544
20545 // Materialize : Idx = (add (mul vscale, NumEls), -1)
20546 // i1 = extract_vector_elt t37, Constant:i64<Idx>
20547 // ... into: "ptrue p, all" + PTEST
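// For example, for an nxv4i1 predicate the index (add (mul vscale, 4), -1) is
// always the last lane, so the extract can be expressed as a PTEST of an
// all-true governing predicate with the LAST_ACTIVE condition.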
20548 static SDValue
20549 performLastTrueTestVectorCombine(SDNode *N,
20550 TargetLowering::DAGCombinerInfo &DCI,
20551 const AArch64Subtarget *Subtarget) {
20552 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20553 // Make sure PTEST can be legalised with illegal types.
20554 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20555 return SDValue();
20556
20557 SDValue N0 = N->getOperand(0);
20558 EVT OpVT = N0.getValueType();
20559
20560 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
20561 return SDValue();
20562
20563 // Idx == (add (mul vscale, NumEls), -1)
20564 SDValue Idx = N->getOperand(1);
20565 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
20566 return SDValue();
20567
20568 SDValue VS = Idx.getOperand(0);
20569 if (VS.getOpcode() != ISD::VSCALE)
20570 return SDValue();
20571
20572 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
20573 if (VS.getConstantOperandVal(0) != NumEls)
20574 return SDValue();
20575
20576 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
20577 SelectionDAG &DAG = DCI.DAG;
20578 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
20579 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
20580}
20581
20582 static SDValue
20583 performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20584 const AArch64Subtarget *Subtarget) {
20585 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20586 SelectionDAG &DAG = DCI.DAG;
20587 SDValue Vec = N->getOperand(0);
20588 SDValue Idx = N->getOperand(1);
20589
20590 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
20591 return SDValue();
20592
20593 // Only legal for 8, 16, 32, and 64 bit element types.
20594 EVT EltVT = Vec.getValueType().getVectorElementType();
20595 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
20596 MVT::bf16, MVT::f32, MVT::f64}),
20597 EltVT.getSimpleVT().SimpleTy))
20598 return SDValue();
20599
20600 SDValue Mask = Idx.getOperand(0);
20601 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20602 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
20603 return SDValue();
20604
20605 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
20606 Vec);
20607}
20608
20609 static SDValue
20610 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20611 const AArch64Subtarget *Subtarget) {
20612 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20613 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
20614 return Res;
20615 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
20616 return Res;
20617 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
20618 return Res;
20619
20620 SelectionDAG &DAG = DCI.DAG;
20621 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20622
20623 EVT VT = N->getValueType(0);
20624 const bool FullFP16 = Subtarget->hasFullFP16();
20625 bool IsStrict = N0->isStrictFPOpcode();
20626
20627 // extract(dup x) -> x
20628 if (N0.getOpcode() == AArch64ISD::DUP)
20629 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
20630 : N0.getOperand(0);
20631
20632 // Rewrite for pairwise fadd pattern
20633 // (f32 (extract_vector_elt
20634 // (fadd (vXf32 Other)
20635 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
20636 // ->
20637 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
20638 // (extract_vector_elt (vXf32 Other) 1))
20639 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
20640 // we can only do this when it's used only by the extract_vector_elt.
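// The resulting scalar fadd of lanes 0 and 1 can then be matched to a single
// FADDP (pairwise add of the bottom two lanes).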
20641 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
20642 (!IsStrict || N0.hasOneUse())) {
20643 SDLoc DL(N0);
20644 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
20645 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
20646
20647 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
20648 SDValue Other = N00;
20649
20650 // And handle the commutative case.
20651 if (!Shuffle) {
20652 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
20653 Other = N01;
20654 }
20655
20656 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
20657 Other == Shuffle->getOperand(0)) {
20658 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20659 DAG.getConstant(0, DL, MVT::i64));
20660 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20661 DAG.getConstant(1, DL, MVT::i64));
20662 if (!IsStrict)
20663 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
20664
20665 // For strict_fadd we need uses of the final extract_vector to be replaced
20666 // with the strict_fadd, but we also need uses of the chain output of the
20667 // original strict_fadd to use the chain output of the new strict_fadd as
20668 // otherwise it may not be deleted.
20669 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20670 {VT, MVT::Other},
20671 {N0->getOperand(0), Extract1, Extract2});
20672 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20673 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20674 return SDValue(N, 0);
20675 }
20676 }
20677
20678 // Given an extract(load) or extract(extend(load)), produce a scalar load
20679 // instead to avoid the cross-register-bank copies.
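// For example, (i32 (extract_vector_elt (v4i32 (load p)), 1)) can be turned
// into an i32 load from p+4, avoiding a GPR<->FPR transfer.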
20680 if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
20681 VT.isInteger() && isa<ConstantSDNode>(N1)) {
20682 SDValue LoadN0 = N0;
20683 // Look through sext/zext and extract_subvector / insert_subvector if
20684 // required.
20685 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
20686 N0.getOpcode() == ISD::SIGN_EXTEND ||
20687 N0.getOpcode() == ISD::ANY_EXTEND) &&
20688 N0.getOperand(0).hasOneUse())
20689 LoadN0 = N0.getOperand(0);
20690 unsigned OffsetElts = 0;
20691 if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
20692 OffsetElts = LoadN0.getConstantOperandVal(1);
20693 LoadN0 = LoadN0.getOperand(0);
20694 }
20695 if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
20696 LoadN0.getOperand(0).isUndef() &&
20697 isNullConstant(LoadN0.getOperand(2)) &&
20698 LoadN0.getOperand(1).hasOneUse())
20699 LoadN0 = LoadN0.getOperand(1);
20700
20701 // Check all the uses are valid and can be scalarized. We check that all the
20702 // uses are extracts and those extracts are not re-inserted into an
20703 // operation best treated as a vector register.
20704 auto Load = dyn_cast<LoadSDNode>(LoadN0);
20705 if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
20706 Load->getMemoryVT().isByteSized() &&
20707 all_of(N0->uses(), [&](const SDUse &U) {
20708 return U.getResNo() != N0.getResNo() ||
20709 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20710 !any_of(U.getUser()->uses(), [](const SDUse &U2) {
20711 return U2.getUser()->getOpcode() ==
20712 ISD::INSERT_VECTOR_ELT ||
20713 U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
20714 U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
20715 }));
20716 })) {
20717
20718 SDLoc DL(Load);
20719
20720 // Generate a new scalar load.
20721 unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
20722 Load->getValueType(0).getScalarSizeInBits() / 8;
20723 SDValue BasePtr = DAG.getObjectPtrOffset(
20724 DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
20725 ISD::LoadExtType ExtType =
20726 N0.getOpcode() == ISD::ZERO_EXTEND ? ISD::ZEXTLOAD
20727 : (N0.getOpcode() == ISD::SIGN_EXTEND
20728 ? ISD::SEXTLOAD
20729 : ISD::EXTLOAD);
20730 SDValue ScalarLoad =
20731 DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
20732 Load->getPointerInfo().getWithOffset(Offset),
20733 Load->getValueType(0).getScalarType(),
20734 commonAlignment(Load->getAlign(), Offset),
20735 Load->getMemOperand()->getFlags(), Load->getAAInfo());
20736 DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
20737 return ScalarLoad;
20738 }
20739 }
20740
20741 return SDValue();
20742}
20743
20744 static SDValue performConcatVectorsCombine(SDNode *N,
20745 TargetLowering::DAGCombinerInfo &DCI,
20746 SelectionDAG &DAG) {
20747 SDLoc DL(N);
20748 EVT VT = N->getValueType(0);
20749 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20750 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20751
20752 if (VT.isScalableVector())
20753 return SDValue();
20754
20755 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20756 N1Opc == ISD::TRUNCATE) {
20757 SDValue N00 = N0->getOperand(0);
20758 SDValue N10 = N1->getOperand(0);
20759 EVT N00VT = N00.getValueType();
20760 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20761
20762 // Optimize concat_vectors of truncated vectors, where the intermediate
20763 // type is illegal, to avoid said illegality, e.g.,
20764 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20765 // (v2i16 (truncate (v2i64)))))
20766 // ->
20767 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20768 // (v4i32 (bitcast (v2i64))),
20769 // <0, 2, 4, 6>)))
20770 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20771 // on both input and result type, so we might generate worse code.
20772 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
20773 if (N00VT == N10.getValueType() &&
20774 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20775 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20776 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20777 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
20778 for (size_t i = 0; i < Mask.size(); ++i)
20779 Mask[i] = i * 2;
20780 return DAG.getNode(ISD::TRUNCATE, DL, VT,
20781 DAG.getVectorShuffle(
20782 MidVT, DL,
20783 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
20784 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
20785 }
20786
20787 // Optimize two large shifts and a combine into a single combine and shift
20788 // For AArch64 architectures, sequences like the following:
20789 //
20790 // ushr v0.4s, v0.4s, #20
20791 // ushr v1.4s, v1.4s, #20
20792 // uzp1 v0.8h, v0.8h, v1.8h
20793 //
20794 // Can be optimized to:
20795 //
20796 // uzp2 v0.8h, v0.8h, v1.8h
20797 // ushr v0.8h, v0.8h, #4
20798 //
20799 // This optimization reduces instruction count.
20800 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20801 N00->getOperand(1) == N10->getOperand(1)) {
20802 SDValue N000 = N00->getOperand(0);
20803 SDValue N100 = N10->getOperand(0);
20804 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20805 N101ConstVal = N10->getConstantOperandVal(1),
20806 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20807
20808 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20809 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
20810 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
20811 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
20812 SDValue NewShiftConstant =
20813 DAG.getTargetConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
20814
20815 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
20816 }
20817 }
20818 }
20819
20820 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20821 N->getOperand(0).getValueType() == MVT::v2i16 ||
20822 N->getOperand(0).getValueType() == MVT::v2i8) {
20823 EVT SrcVT = N->getOperand(0).getValueType();
20824 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20825 // loads to prevent having to go through the v4i8 load legalization that
20826 // needs to extend each element into a larger type.
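// For example, (v8i8 (concat_vectors (v4i8 (load p)), (v4i8 (load q)))) can
// be rebuilt as (bitcast (build_vector (f32 (load p)), (f32 (load q)))).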
20827 if (N->getNumOperands() % 2 == 0 &&
20828 all_of(N->op_values(), [SrcVT](SDValue V) {
20829 if (V.getValueType() != SrcVT)
20830 return false;
20831 if (V.isUndef())
20832 return true;
20833 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20834 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20835 LD->getExtensionType() == ISD::NON_EXTLOAD;
20836 })) {
20837 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20838 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20839 SmallVector<SDValue> Ops;
20840
20841 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20842 SDValue V = N->getOperand(i);
20843 if (V.isUndef())
20844 Ops.push_back(DAG.getUNDEF(FVT));
20845 else {
20846 LoadSDNode *LD = cast<LoadSDNode>(V);
20847 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
20848 LD->getBasePtr(), LD->getMemOperand());
20849 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20850 Ops.push_back(NewLoad);
20851 }
20852 }
20853 return DAG.getBitcast(N->getValueType(0),
20854 DAG.getBuildVector(NVT, DL, Ops));
20855 }
20856 }
20857
20858 // Canonicalise concat_vectors to replace concatenations of truncated nots
20859 // with nots of concatenated truncates. This in some cases allows for multiple
20860 // redundant negations to be eliminated.
20861 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20862 // (v4i16 (truncate (not (v4i32)))))
20863 // ->
20864 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20865 // (v4i16 (truncate (v4i32)))))
20866 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20867 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20868 N->isOnlyUserOf(N1.getNode())) {
20869 auto isBitwiseVectorNegate = [](SDValue V) {
20870 return V->getOpcode() == ISD::XOR &&
20871 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20872 };
20873 SDValue N00 = N0->getOperand(0);
20874 SDValue N10 = N1->getOperand(0);
20875 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20876 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20877 return DAG.getNOT(
20878 DL,
20879 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20880 DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(),
20881 N00->getOperand(0)),
20882 DAG.getNode(ISD::TRUNCATE, DL, N1.getValueType(),
20883 N10->getOperand(0))),
20884 VT);
20885 }
20886 }
20887
20888 // Wait till after everything is legalized to try this. That way we have
20889 // legal vector types and such.
20890 if (DCI.isBeforeLegalizeOps())
20891 return SDValue();
20892
20893 // Optimise concat_vectors of two identical binops with a 128-bit destination
20894 // size, combining them into a binop of two concats of the source vectors, e.g.:
20895 // concat(uhadd(a, b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20896 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20897 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
20898 isVectorizedBinOp(N0Opc)) &&
20899 N0->hasOneUse() && N1->hasOneUse()) {
20900 SDValue N00 = N0->getOperand(0);
20901 SDValue N01 = N0->getOperand(1);
20902 SDValue N10 = N1->getOperand(0);
20903 SDValue N11 = N1->getOperand(1);
20904
20905 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20906 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
20907 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
20908 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
20909 }
20910 }
20911
20912 auto IsRSHRN = [](SDValue Shr) {
20913 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20914 return false;
20915 SDValue Op = Shr.getOperand(0);
20916 EVT VT = Op.getValueType();
20917 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20918 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20919 return false;
20920
20921 APInt Imm;
20922 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20923 Imm = APInt(VT.getScalarSizeInBits(),
20924 Op.getOperand(1).getConstantOperandVal(0)
20925 << Op.getOperand(1).getConstantOperandVal(1));
20926 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20927 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20928 Imm = APInt(VT.getScalarSizeInBits(),
20929 Op.getOperand(1).getConstantOperandVal(0));
20930 else
20931 return false;
20932
20933 if (Imm != 1ULL << (ShtAmt - 1))
20934 return false;
20935 return true;
20936 };
20937
20938 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
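// When both halves are rounding shifts by the same amount (or the second half
// is undef), the rounding add and the shift can be performed once on the
// double-width concatenation.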
20939 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20940 ((IsRSHRN(N1) &&
20941 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20942 N1.isUndef())) {
20943 SDValue X = N0.getOperand(0).getOperand(0);
20944 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20945 : N1.getOperand(0).getOperand(0);
20946 EVT BVT =
20947 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20948 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
20949 SDValue Add = DAG.getNode(
20950 ISD::ADD, DL, BVT, CC,
20951 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
20952 SDValue Shr =
20953 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
20954 return Shr;
20955 }
20956
20957 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
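// For example, for v8i8 inputs a and b, concat(zip1(a, b), zip2(a, b)) is the
// full interleave of a and b, which is zip1 of the two 128-bit widened
// operands (with their top halves left undefined).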
20958 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20959 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20960 N0.getOperand(1) == N1.getOperand(1)) {
20961 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
20962 DAG.getUNDEF(N0.getValueType()));
20963 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
20964 DAG.getUNDEF(N0.getValueType()));
20965 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
20966 }
20967
20968 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20969 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20970 // canonicalise to that.
20971 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20972 assert(VT.getScalarSizeInBits() == 64);
20973 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
20974 DAG.getConstant(0, DL, MVT::i64));
20975 }
20976
20977 // Canonicalise concat_vectors so that the right-hand vector has as few
20978 // bit-casts as possible before its real operation. The primary matching
20979