doxygen/HexagonVectorCombine_8cpp_source.html

//===-- HexagonVectorCombine.cpp ------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

// HexagonVectorCombine is a utility class implementing a variety of functions

// that assist in vector-based optimizations.

//

// AlignVectors: replace unaligned vector loads and stores with aligned ones.

// HvxIdioms: recognize various opportunities to generate HVX intrinsic code.

//===----------------------------------------------------------------------===//


#include "llvm/ADT/APInt.h"

#include "llvm/ADT/ArrayRef.h"

#include "llvm/ADT/DenseMap.h"

#include "llvm/ADT/STLExtras.h"

#include "llvm/ADT/SmallVector.h"

#include "llvm/Analysis/AliasAnalysis.h"

#include "llvm/Analysis/AssumptionCache.h"

#include "llvm/Analysis/InstSimplifyFolder.h"

#include "llvm/Analysis/InstructionSimplify.h"

#include "llvm/Analysis/ScalarEvolutionExpressions.h"

#include "llvm/Analysis/TargetLibraryInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/Analysis/VectorUtils.h"

#include "llvm/CodeGen/TargetPassConfig.h"

#include "llvm/CodeGen/ValueTypes.h"

#include "llvm/IR/Dominators.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/IntrinsicInst.h"

#include "llvm/IR/Intrinsics.h"

#include "llvm/IR/IntrinsicsHexagon.h"

#include "llvm/IR/Metadata.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/InitializePasses.h"

#include "llvm/Pass.h"

#include "llvm/Support/CommandLine.h"

#include "llvm/Support/KnownBits.h"

#include "llvm/Support/MathExtras.h"

#include "llvm/Support/raw_ostream.h"

#include "llvm/Target/TargetMachine.h"

#include "llvm/Transforms/Utils/Local.h"


#include "Hexagon.h"

#include "HexagonSubtarget.h"

#include "HexagonTargetMachine.h"


#include <algorithm>

#include <deque>

#include <map>

#include <optional>

#include <set>

#include <utility>

#include <vector>


#define DEBUG_TYPE "hexagon-vc"


// This is a const that represents default HVX VTCM page size.

// It is boot time configurable, so we probably want an API to

// read it, but for now assume 128KB

#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072


using namespace llvm;


namespace {

cl::opt<bool> DumpModule("hvc-dump-module", cl::Hidden);

cl::opt<bool> VAEnabled("hvc-va", cl::Hidden, cl::init(true)); // Align

cl::opt<bool> VIEnabled("hvc-vi", cl::Hidden, cl::init(true)); // Idioms

cl::opt<bool> VADoFullStores("hvc-va-full-stores", cl::Hidden);


cl::opt<unsigned> VAGroupCountLimit("hvc-va-group-count-limit", cl::Hidden,

                                    cl::init(~0));

cl::opt<unsigned> VAGroupSizeLimit("hvc-va-group-size-limit", cl::Hidden,

                                   cl::init(~0));


class HexagonVectorCombine {

public:

  HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,

                       DominatorTree &DT_, ScalarEvolution &SE_,

                       TargetLibraryInfo &TLI_, const TargetMachine &TM_)

      : F(F_), DL(F.getDataLayout()), AA(AA_), AC(AC_), DT(DT_),

        SE(SE_), TLI(TLI_),

        HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}


  bool run();


  // Common integer type.

  IntegerType *getIntTy(unsigned Width = 32) const;

  // Byte type: either scalar (when Length = 0), or vector with given

  // element count.

  Type *getByteTy(int ElemCount = 0) const;

  // Boolean type: either scalar (when Length = 0), or vector with given

  // element count.

  Type *getBoolTy(int ElemCount = 0) const;

  // Create a ConstantInt of type returned by getIntTy with the value Val.

  ConstantInt *getConstInt(int Val, unsigned Width = 32) const;

  // Get the integer value of V, if it exists.

  std::optional<APInt> getIntValue(const Value *Val) const;

  // Is Val a constant 0, or a vector of 0s?

  bool isZero(const Value *Val) const;

  // Is Val an undef value?

  bool isUndef(const Value *Val) const;

  // Is Val a scalar (i1 true) or a vector of (i1 true)?

  bool isTrue(const Value *Val) const;

  // Is Val a scalar (i1 false) or a vector of (i1 false)?

  bool isFalse(const Value *Val) const;


  // Get HVX vector type with the given element type.

  VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;


  enum SizeKind {

    Store, // Store size

    Alloc, // Alloc size

  };

  int getSizeOf(const Value *Val, SizeKind Kind = Store) const;

  int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;

  int getTypeAlignment(Type *Ty) const;

  size_t length(Value *Val) const;

  size_t length(Type *Ty) const;


  Constant *getNullValue(Type *Ty) const;

  Constant *getFullValue(Type *Ty) const;

  Constant *getConstSplat(Type *Ty, int Val) const;


  Value *simplify(Value *Val) const;


  Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,

                 int Length, int Where) const;

  Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,

                  Value *Amt) const;

  Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,

                  Value *Amt) const;

  Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;

  Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,

                 Value *Pad) const;

  Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,

                 Type *ToTy) const;

  Value *vlsb(IRBuilderBase &Builder, Value *Val) const;

  Value *vbytes(IRBuilderBase &Builder, Value *Val) const;

  Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,

                   unsigned Length) const;

  Value *sublo(IRBuilderBase &Builder, Value *Val) const;

  Value *subhi(IRBuilderBase &Builder, Value *Val) const;

  Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;

  Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;


  Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,

                            Type *RetTy, ArrayRef<Value *> Args,

                            ArrayRef<Type *> ArgTys = {},

                            ArrayRef<Value *> MDSources = {}) const;

  SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,

                                           unsigned ToWidth) const;

  Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,

                            VectorType *ToType) const;


  std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;


  unsigned getNumSignificantBits(const Value *V,

                                 const Instruction *CtxI = nullptr) const;

  KnownBits getKnownBits(const Value *V,

                         const Instruction *CtxI = nullptr) const;


  bool isSafeToClone(const Instruction &In) const;


  template <typename T = std::vector<Instruction *>>

  bool isSafeToMoveBeforeInBB(const Instruction &In,

                              BasicBlock::const_iterator To,

                              const T &IgnoreInsts = {}) const;


  // This function is only used for assertions at the moment.

  [[maybe_unused]] bool isByteVecTy(Type *Ty) const;


  Function &F;

  const DataLayout &DL;

  AliasAnalysis &AA;

  AssumptionCache &AC;

  DominatorTree &DT;

  ScalarEvolution &SE;

  TargetLibraryInfo &TLI;

  const HexagonSubtarget &HST;


private:

  Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,

                         int Start, int Length) const;

};


class AlignVectors {

  // This code tries to replace unaligned vector loads/stores with aligned

  // ones.

  // Consider unaligned load:

  //   %v = original_load %some_addr, align <bad>

  //   %user = %v

  // It will generate

  //      = load ..., align <good>

  //      = load ..., align <good>

  //      = valign

  //      etc.

  //   %synthesize = combine/shuffle the loaded data so that it looks

  //                 exactly like what "original_load" has loaded.

  //   %user = %synthesize

  // Similarly for stores.

public:

  AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}


  bool run();


private:

  using InstList = std::vector<Instruction *>;

  using InstMap = DenseMap<Instruction *, Instruction *>;


  struct AddrInfo {

    AddrInfo(const AddrInfo &) = default;

    AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,

             Align H)

        : Inst(I), Addr(A), ValTy(T), HaveAlign(H),

          NeedAlign(HVC.getTypeAlignment(ValTy)) {}

    AddrInfo &operator=(const AddrInfo &) = default;


    // XXX: add Size member?

    Instruction *Inst;

    Value *Addr;

    Type *ValTy;

    Align HaveAlign;

    Align NeedAlign;

    int Offset = 0; // Offset (in bytes) from the first member of the

                    // containing AddrList.

  };

  using AddrList = std::vector<AddrInfo>;


  struct InstrLess {

    bool operator()(const Instruction *A, const Instruction *B) const {

      return A->comesBefore(B);

    }

  };

  using DepList = std::set<Instruction *, InstrLess>;


  struct MoveGroup {

    MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)

        : Base(B), Main{AI.Inst}, Clones{}, IsHvx(Hvx), IsLoad(Load) {}

    MoveGroup() = default;

    Instruction *Base; // Base instruction of the parent address group.

    InstList Main;     // Main group of instructions.

    InstList Deps;     // List of dependencies.

    InstMap Clones;    // Map from original Deps to cloned ones.

    bool IsHvx;        // Is this group of HVX instructions?

    bool IsLoad;       // Is this a load group?

  };

  using MoveList = std::vector<MoveGroup>;


  struct ByteSpan {

    // A representation of "interesting" bytes within a given span of memory.

    // These bytes are those that are loaded or stored, and they don't have

    // to cover the entire span of memory.

    //

    // The representation works by picking a contiguous sequence of bytes

    // from somewhere within a llvm::Value, and placing it at a given offset

    // within the span.

    //

    // The sequence of bytes from llvm:Value is represented by Segment.

    // Block is Segment, plus where it goes in the span.

    //

    // An important feature of ByteSpan is being able to make a "section",

    // i.e. creating another ByteSpan corresponding to a range of offsets

    // relative to the source span.


    struct Segment {

      // Segment of a Value: 'Len' bytes starting at byte 'Begin'.

      Segment(Value *Val, int Begin, int Len)

          : Val(Val), Start(Begin), Size(Len) {}

      Segment(const Segment &Seg) = default;

      Segment &operator=(const Segment &Seg) = default;

      Value *Val; // Value representable as a sequence of bytes.

      int Start;  // First byte of the value that belongs to the segment.

      int Size;   // Number of bytes in the segment.

    };


    struct Block {

      Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}

      Block(Value *Val, int Off, int Len, int Pos)

          : Seg(Val, Off, Len), Pos(Pos) {}

      Block(const Block &Blk) = default;

      Block &operator=(const Block &Blk) = default;

      Segment Seg; // Value segment.

      int Pos;     // Position (offset) of the block in the span.

    };


    int extent() const;

    ByteSpan section(int Start, int Length) const;

    ByteSpan &shift(int Offset);

    SmallVector<Value *, 8> values() const;


    int size() const { return Blocks.size(); }

    Block &operator[](int i) { return Blocks[i]; }

    const Block &operator[](int i) const { return Blocks[i]; }


    std::vector<Block> Blocks;


    using iterator = decltype(Blocks)::iterator;

    iterator begin() { return Blocks.begin(); }

    iterator end() { return Blocks.end(); }

    using const_iterator = decltype(Blocks)::const_iterator;

    const_iterator begin() const { return Blocks.begin(); }

    const_iterator end() const { return Blocks.end(); }

  };


  std::optional<AddrInfo> getAddrInfo(Instruction &In) const;

  bool isHvx(const AddrInfo &AI) const;

  // This function is only used for assertions at the moment.

  [[maybe_unused]] bool isSectorTy(Type *Ty) const;


  Value *getPayload(Value *Val) const;

  Value *getMask(Value *Val) const;

  Value *getPassThrough(Value *Val) const;


  Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,

                               int Adjust,

                               const InstMap &CloneMap = InstMap()) const;

  Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,

                              int Alignment,

                              const InstMap &CloneMap = InstMap()) const;


  Value *createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,

                    Value *Predicate, int Alignment, Value *Mask,

                    Value *PassThru, ArrayRef<Value *> MDSources = {}) const;

  Value *createSimpleLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,

                          int Alignment,

                          ArrayRef<Value *> MDSources = {}) const;


  Value *createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,

                     Value *Predicate, int Alignment, Value *Mask,

                     ArrayRef<Value *> MDSources = {}) const;

  Value *createSimpleStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,

                           int Alignment,

                           ArrayRef<Value *> MDSources = {}) const;


  Value *createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,

                              Value *Predicate, int Alignment,

                              ArrayRef<Value *> MDSources = {}) const;

  Value *createPredicatedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,

                               Value *Predicate, int Alignment,

                               ArrayRef<Value *> MDSources = {}) const;


  DepList getUpwardDeps(Instruction *In, Instruction *Base) const;

  bool createAddressGroups();

  MoveList createLoadGroups(const AddrList &Group) const;

  MoveList createStoreGroups(const AddrList &Group) const;

  bool moveTogether(MoveGroup &Move) const;

  template <typename T>

  InstMap cloneBefore(BasicBlock::iterator To, T &&Insts) const;


  void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,

                        int ScLen, Value *AlignVal, Value *AlignAddr) const;

  void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,

                         int ScLen, Value *AlignVal, Value *AlignAddr) const;

  bool realignGroup(const MoveGroup &Move) const;


  Value *makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,

                             int Alignment) const;


  friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);

  friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);

  friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);

  friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);


  std::map<Instruction *, AddrList> AddrGroups;

  const HexagonVectorCombine &HVC;

};


[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,

                                         const AlignVectors::AddrInfo &AI) {

  OS << "Inst: " << AI.Inst << "  " << *AI.Inst << '\n';

  OS << "Addr: " << *AI.Addr << '\n';

  OS << "Type: " << *AI.ValTy << '\n';

  OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';

  OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';

  OS << "Offset: " << AI.Offset;

  return OS;

}


[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,

                                         const AlignVectors::MoveGroup &MG) {

  OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");

  OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';

  OS << "Main\n";

  for (Instruction *I : MG.Main)

    OS << "  " << *I << '\n';

  OS << "Deps\n";

  for (Instruction *I : MG.Deps)

    OS << "  " << *I << '\n';

  OS << "Clones\n";

  for (auto [K, V] : MG.Clones) {

    OS << "    ";

    K->printAsOperand(OS, false);

    OS << "\t-> " << *V << '\n';

  }

  return OS;

}


[[maybe_unused]] raw_ostream &

operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) {

  OS << "  @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";

  if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {

    OS << "(self:" << B.Seg.Val << ')';

  } else if (B.Seg.Val != nullptr) {

    OS << *B.Seg.Val;

  } else {

    OS << "(null)";

  }

  return OS;

}


[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,

                                         const AlignVectors::ByteSpan &BS) {

  OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';

  for (const AlignVectors::ByteSpan::Block &B : BS)

    OS << B << '\n';

  OS << ']';

  return OS;

}


class HvxIdioms {

public:

  enum DstQualifier {

    Undefined = 0,

    Arithmetic,

    LdSt,

    LLVM_Gather,

    LLVM_Scatter,

    HEX_Gather_Scatter,

    HEX_Gather,

    HEX_Scatter,

    Call

  };


  HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {

    auto *Int32Ty = HVC.getIntTy(32);

    HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);

    HvxP32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/true);

  }


  bool run();


private:

  enum Signedness { Positive, Signed, Unsigned };


  // Value + sign

  // This is to keep track of whether the value should be treated as signed

  // or unsigned, or is known to be positive.

  struct SValue {

    Value *Val;

    Signedness Sgn;

  };


  struct FxpOp {

    unsigned Opcode;

    unsigned Frac; // Number of fraction bits

    SValue X, Y;

    // If present, add 1 << RoundAt before shift:

    std::optional<unsigned> RoundAt;

    VectorType *ResTy;

  };


  auto getNumSignificantBits(Value *V, Instruction *In) const

      -> std::pair<unsigned, Signedness>;

  auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;


  auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;

  auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;


  auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,

                            const FxpOp &Op) const -> Value *;

  auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,

                    bool Rounding) const -> Value *;

  auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,

                    bool Rounding) const -> Value *;

  // Return {Result, Carry}, where Carry is a vector predicate.

  auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,

                      Value *CarryIn = nullptr) const

      -> std::pair<Value *, Value *>;

  auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;

  auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const

      -> Value *;

  auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const

      -> std::pair<Value *, Value *>;

  auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,

                     ArrayRef<Value *> WordY) const -> SmallVector<Value *>;

  auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,

                     Signedness SgnX, ArrayRef<Value *> WordY,

                     Signedness SgnY) const -> SmallVector<Value *>;

  // Vector manipulations for Ripple

  bool matchScatter(Instruction &In) const;

  bool matchGather(Instruction &In) const;

  Value *processVScatter(Instruction &In) const;

  Value *processVGather(Instruction &In) const;


  VectorType *HvxI32Ty;

  VectorType *HvxP32Ty;

  const HexagonVectorCombine &HVC;


  friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);

};


[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,

                                         const HvxIdioms::FxpOp &Op) {

  static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};

  OS << Instruction::getOpcodeName(Op.Opcode) << '.' << Op.Frac;

  if (Op.RoundAt.has_value()) {

    if (Op.Frac != 0 && *Op.RoundAt == Op.Frac - 1) {

      OS << ":rnd";

    } else {

      OS << " + 1<<" << *Op.RoundAt;

    }

  }

  OS << "\n  X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"

     << "  Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;

  return OS;

}


} // namespace


namespace {


template <typename T> T *getIfUnordered(T *MaybeT) {

  return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;

}

template <typename T> T *isCandidate(Instruction *In) {

  return dyn_cast<T>(In);

}

template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {

  return getIfUnordered(dyn_cast<LoadInst>(In));

}

template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {

  return getIfUnordered(dyn_cast<StoreInst>(In));

}


#if !defined(_MSC_VER) || _MSC_VER >= 1926

// VS2017 and some versions of VS2019 have trouble compiling this:

// error C2976: 'std::map': too few template arguments

// VS 2019 16.x is known to work, except for 16.4/16.5 (MSC_VER 1924/1925)

template <typename Pred, typename... Ts>

void erase_if(std::map<Ts...> &map, Pred p)

#else

template <typename Pred, typename T, typename U>

void erase_if(std::map<T, U> &map, Pred p)

#endif

{

  for (auto i = map.begin(), e = map.end(); i != e;) {

    if (p(*i))

      i = map.erase(i);

    else

      i = std::next(i);

  }

}


// Forward other erase_ifs to the LLVM implementations.

template <typename Pred, typename T> void erase_if(T &&container, Pred p) {

  llvm::erase_if(std::forward<T>(container), p);

}


} // namespace


// --- Begin AlignVectors


// For brevity, only consider loads. We identify a group of loads where we

// know the relative differences between their addresses, so we know how they

// are laid out in memory (relative to one another). These loads can overlap,

// can be shorter or longer than the desired vector length.

// Ultimately we want to generate a sequence of aligned loads that will load

// every byte that the original loads loaded, and have the program use these

// loaded values instead of the original loads.

// We consider the contiguous memory area spanned by all these loads.

//

// Let's say that a single aligned vector load can load 16 bytes at a time.

// If the program wanted to use a byte at offset 13 from the beginning of the

// original span, it will be a byte at offset 13+x in the aligned data for

// some x>=0. This may happen to be in the first aligned load, or in the load

// following it. Since we generally don't know what the that alignment value

// is at compile time, we proactively do valigns on the aligned loads, so that

// byte that was at offset 13 is still at offset 13 after the valigns.

//

// This will be the starting point for making the rest of the program use the

// data loaded by the new loads.

// For each original load, and its users:

//   %v = load ...

//   ... = %v

//   ... = %v

// we create

//   %new_v = extract/combine/shuffle data from loaded/valigned vectors so

//            it contains the same value as %v did before

// then replace all users of %v with %new_v.

//   ... = %new_v

//   ... = %new_v


auto AlignVectors::ByteSpan::extent() const -> int {

  if (size() == 0)

    return 0;

  int Min = Blocks[0].Pos;

  int Max = Blocks[0].Pos + Blocks[0].Seg.Size;

  for (int i = 1, e = size(); i != e; ++i) {

    Min = std::min(Min, Blocks[i].Pos);

    Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);

  }

  return Max - Min;

}


auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {

  ByteSpan Section;

  for (const ByteSpan::Block &B : Blocks) {

    int L = std::max(B.Pos, Start);                       // Left end.

    int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.

    if (L < R) {

      // How much to chop off the beginning of the segment:

      int Off = L > B.Pos ? L - B.Pos : 0;

      Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);

    }

  }

  return Section;

}


auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {

  for (Block &B : Blocks)

    B.Pos += Offset;

  return *this;

}


auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {

  SmallVector<Value *, 8> Values(Blocks.size());

  for (int i = 0, e = Blocks.size(); i != e; ++i)

    Values[i] = Blocks[i].Seg.Val;

  return Values;

}


auto AlignVectors::getAddrInfo(Instruction &In) const

    -> std::optional<AddrInfo> {

  if (auto *L = isCandidate<LoadInst>(&In))

    return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),

                    L->getAlign());

  if (auto *S = isCandidate<StoreInst>(&In))

    return AddrInfo(HVC, S, S->getPointerOperand(),

                    S->getValueOperand()->getType(), S->getAlign());

  if (auto *II = isCandidate<IntrinsicInst>(&In)) {

    Intrinsic::ID ID = II->getIntrinsicID();

    switch (ID) {

    case Intrinsic::masked_load:

      return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),

                      II->getParamAlign(0).valueOrOne());

    case Intrinsic::masked_store:

      return AddrInfo(HVC, II, II->getArgOperand(1),

                      II->getArgOperand(0)->getType(),

                      II->getParamAlign(1).valueOrOne());

    }

  }

  return std::nullopt;

}


auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {

  return HVC.HST.isTypeForHVX(AI.ValTy);

}


auto AlignVectors::getPayload(Value *Val) const -> Value * {

  if (auto *In = dyn_cast<Instruction>(Val)) {

    Intrinsic::ID ID = 0;

    if (auto *II = dyn_cast<IntrinsicInst>(In))

      ID = II->getIntrinsicID();

    if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)

      return In->getOperand(0);

  }

  return Val;

}


auto AlignVectors::getMask(Value *Val) const -> Value * {

  if (auto *II = dyn_cast<IntrinsicInst>(Val)) {

    switch (II->getIntrinsicID()) {

    case Intrinsic::masked_load:

      return II->getArgOperand(1);

    case Intrinsic::masked_store:

      return II->getArgOperand(2);

    }

  }


  Type *ValTy = getPayload(Val)->getType();

  if (auto *VecTy = dyn_cast<VectorType>(ValTy))

    return HVC.getFullValue(HVC.getBoolTy(HVC.length(VecTy)));

  return HVC.getFullValue(HVC.getBoolTy());

}


auto AlignVectors::getPassThrough(Value *Val) const -> Value * {

  if (auto *II = dyn_cast<IntrinsicInst>(Val)) {

    if (II->getIntrinsicID() == Intrinsic::masked_load)

      return II->getArgOperand(2);

  }

  return UndefValue::get(getPayload(Val)->getType());

}


auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,

                                         Type *ValTy, int Adjust,

                                         const InstMap &CloneMap) const

    -> Value * {

  if (auto *I = dyn_cast<Instruction>(Ptr))

    if (Instruction *New = CloneMap.lookup(I))

      Ptr = New;

  return Builder.CreatePtrAdd(Ptr, HVC.getConstInt(Adjust), "gep");

}


auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,

                                        Type *ValTy, int Alignment,

                                        const InstMap &CloneMap) const

    -> Value * {

  auto remap = [&](Value *V) -> Value * {

    if (auto *I = dyn_cast<Instruction>(V)) {

      for (auto [Old, New] : CloneMap)

        I->replaceUsesOfWith(Old, New);

      return I;

    }

    return V;

  };

  Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy(), "pti");

  Value *Mask = HVC.getConstInt(-Alignment);

  Value *And = Builder.CreateAnd(remap(AsInt), Mask, "and");

  return Builder.CreateIntToPtr(

      And, PointerType::getUnqual(ValTy->getContext()), "itp");

}


auto AlignVectors::createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,

                              Value *Predicate, int Alignment, Value *Mask,

                              Value *PassThru,

                              ArrayRef<Value *> MDSources) const -> Value * {

  bool HvxHasPredLoad = HVC.HST.useHVXV62Ops();

  // Predicate is nullptr if not creating predicated load

  if (Predicate) {

    assert(!Predicate->getType()->isVectorTy() &&

           "Expectning scalar predicate");

    if (HVC.isFalse(Predicate))

      return UndefValue::get(ValTy);

    if (!HVC.isTrue(Predicate) && HvxHasPredLoad) {

      Value *Load = createPredicatedLoad(Builder, ValTy, Ptr, Predicate,

                                         Alignment, MDSources);

      return Builder.CreateSelect(Mask, Load, PassThru);

    }

    // Predicate == true here.

  }

  assert(!HVC.isUndef(Mask)); // Should this be allowed?

  if (HVC.isZero(Mask))

    return PassThru;

  if (HVC.isTrue(Mask))

    return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);


  Instruction *Load = Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment),

                                               Mask, PassThru, "mld");

  propagateMetadata(Load, MDSources);

  return Load;

}


auto AlignVectors::createSimpleLoad(IRBuilderBase &Builder, Type *ValTy,

                                    Value *Ptr, int Alignment,

                                    ArrayRef<Value *> MDSources) const

    -> Value * {

  Instruction *Load =

      Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment), "ald");

  propagateMetadata(Load, MDSources);

  return Load;

}


auto AlignVectors::createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy,

                                        Value *Ptr, Value *Predicate,

                                        int Alignment,

                                        ArrayRef<Value *> MDSources) const

    -> Value * {

  assert(HVC.HST.isTypeForHVX(ValTy) &&

         "Predicates 'scalar' vector loads not yet supported");

  assert(Predicate);

  assert(!Predicate->getType()->isVectorTy() && "Expectning scalar predicate");

  assert(HVC.getSizeOf(ValTy, HVC.Alloc) % Alignment == 0);

  if (HVC.isFalse(Predicate))

    return UndefValue::get(ValTy);

  if (HVC.isTrue(Predicate))

    return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);


  auto V6_vL32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vL32b_pred_ai);

  // FIXME: This may not put the offset from Ptr into the vmem offset.

  return HVC.createHvxIntrinsic(Builder, V6_vL32b_pred_ai, ValTy,

                                {Predicate, Ptr, HVC.getConstInt(0)}, {},

                                MDSources);

}


auto AlignVectors::createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,

                               Value *Predicate, int Alignment, Value *Mask,

                               ArrayRef<Value *> MDSources) const -> Value * {

  if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))

    return UndefValue::get(Val->getType());

  assert(!Predicate || (!Predicate->getType()->isVectorTy() &&

                        "Expectning scalar predicate"));

  if (Predicate) {

    if (HVC.isFalse(Predicate))

      return UndefValue::get(Val->getType());

    if (HVC.isTrue(Predicate))

      Predicate = nullptr;

  }

  // Here both Predicate and Mask are true or unknown.


  if (HVC.isTrue(Mask)) {

    if (Predicate) { // Predicate unknown

      return createPredicatedStore(Builder, Val, Ptr, Predicate, Alignment,

                                   MDSources);

    }

    // Predicate is true:

    return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);

  }


  // Mask is unknown

  if (!Predicate) {

    Instruction *Store =

        Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);

    propagateMetadata(Store, MDSources);

    return Store;

  }


  // Both Predicate and Mask are unknown.

  // Emulate masked store with predicated-load + mux + predicated-store.

  Value *PredLoad = createPredicatedLoad(Builder, Val->getType(), Ptr,

                                         Predicate, Alignment, MDSources);

  Value *Mux = Builder.CreateSelect(Mask, Val, PredLoad);

  return createPredicatedStore(Builder, Mux, Ptr, Predicate, Alignment,

                               MDSources);

}


auto AlignVectors::createSimpleStore(IRBuilderBase &Builder, Value *Val,

                                     Value *Ptr, int Alignment,

                                     ArrayRef<Value *> MDSources) const

    -> Value * {

  Instruction *Store = Builder.CreateAlignedStore(Val, Ptr, Align(Alignment));

  propagateMetadata(Store, MDSources);

  return Store;

}


auto AlignVectors::createPredicatedStore(IRBuilderBase &Builder, Value *Val,

                                         Value *Ptr, Value *Predicate,

                                         int Alignment,

                                         ArrayRef<Value *> MDSources) const

    -> Value * {

  assert(HVC.HST.isTypeForHVX(Val->getType()) &&

         "Predicates 'scalar' vector stores not yet supported");

  assert(Predicate);

  if (HVC.isFalse(Predicate))

    return UndefValue::get(Val->getType());

  if (HVC.isTrue(Predicate))

    return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);


  assert(HVC.getSizeOf(Val, HVC.Alloc) % Alignment == 0);

  auto V6_vS32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vS32b_pred_ai);

  // FIXME: This may not put the offset from Ptr into the vmem offset.

  return HVC.createHvxIntrinsic(Builder, V6_vS32b_pred_ai, nullptr,

                                {Predicate, Ptr, HVC.getConstInt(0), Val}, {},

                                MDSources);

}


auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const

    -> DepList {

  BasicBlock *Parent = Base->getParent();

  assert(In->getParent() == Parent &&

         "Base and In should be in the same block");

  assert(Base->comesBefore(In) && "Base should come before In");


  DepList Deps;

  std::deque<Instruction *> WorkQ = {In};

  while (!WorkQ.empty()) {

    Instruction *D = WorkQ.front();

    WorkQ.pop_front();

    if (D != In)

      Deps.insert(D);

    for (Value *Op : D->operands()) {

      if (auto *I = dyn_cast<Instruction>(Op)) {

        if (I->getParent() == Parent && Base->comesBefore(I))

          WorkQ.push_back(I);

      }

    }

  }

  return Deps;

}


auto AlignVectors::createAddressGroups() -> bool {

  // An address group created here may contain instructions spanning

  // multiple basic blocks.

  AddrList WorkStack;


  auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {

    for (AddrInfo &W : WorkStack) {

      if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))

        return std::make_pair(W.Inst, *D);

    }

    return std::make_pair(nullptr, 0);

  };


  auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {

    BasicBlock &Block = *DomN->getBlock();

    for (Instruction &I : Block) {

      auto AI = this->getAddrInfo(I); // Use this-> for gcc6.

      if (!AI)

        continue;

      auto F = findBaseAndOffset(*AI);

      Instruction *GroupInst;

      if (Instruction *BI = F.first) {

        AI->Offset = F.second;

        GroupInst = BI;

      } else {

        WorkStack.push_back(*AI);

        GroupInst = AI->Inst;

      }

      AddrGroups[GroupInst].push_back(*AI);

    }


    for (DomTreeNode *C : DomN->children())

      Visit(C, Visit);


    while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)

      WorkStack.pop_back();

  };


  traverseBlock(HVC.DT.getRootNode(), traverseBlock);

  assert(WorkStack.empty());


  // AddrGroups are formed.


  // Remove groups of size 1.

  erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; });

  // Remove groups that don't use HVX types.

  erase_if(AddrGroups, [&](auto &G) {

    return llvm::none_of(

        G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });

  });


  return !AddrGroups.empty();

}


auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {

  // Form load groups.

  // To avoid complications with moving code across basic blocks, only form

  // groups that are contained within a single basic block.

  unsigned SizeLimit = VAGroupSizeLimit;

  if (SizeLimit == 0)

    return {};


  auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {

    assert(!Move.Main.empty() && "Move group should have non-empty Main");

    if (Move.Main.size() >= SizeLimit)

      return false;

    // Don't mix HVX and non-HVX instructions.

    if (Move.IsHvx != isHvx(Info))

      return false;

    // Leading instruction in the load group.

    Instruction *Base = Move.Main.front();

    if (Base->getParent() != Info.Inst->getParent())

      return false;

    // Check if it's safe to move the load.

    if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator()))

      return false;

    // And if it's safe to clone the dependencies.

    auto isSafeToCopyAtBase = [&](const Instruction *I) {

      return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator()) &&

             HVC.isSafeToClone(*I);

    };

    DepList Deps = getUpwardDeps(Info.Inst, Base);

    if (!llvm::all_of(Deps, isSafeToCopyAtBase))

      return false;


    Move.Main.push_back(Info.Inst);

    llvm::append_range(Move.Deps, Deps);

    return true;

  };


  MoveList LoadGroups;


  for (const AddrInfo &Info : Group) {

    if (!Info.Inst->mayReadFromMemory())

      continue;

    if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))

      LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);

  }


  // Erase singleton groups.

  erase_if(LoadGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });


  // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).

  if (!HVC.HST.useHVXV62Ops())

    erase_if(LoadGroups, [](const MoveGroup &G) { return G.IsHvx; });


  return LoadGroups;

}


auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {

  // Form store groups.

  // To avoid complications with moving code across basic blocks, only form

  // groups that are contained within a single basic block.

  unsigned SizeLimit = VAGroupSizeLimit;

  if (SizeLimit == 0)

    return {};


  auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {

    assert(!Move.Main.empty() && "Move group should have non-empty Main");

    if (Move.Main.size() >= SizeLimit)

      return false;

    // For stores with return values we'd have to collect downward dependencies.

    // There are no such stores that we handle at the moment, so omit that.

    assert(Info.Inst->getType()->isVoidTy() &&

           "Not handling stores with return values");

    // Don't mix HVX and non-HVX instructions.

    if (Move.IsHvx != isHvx(Info))

      return false;

    // For stores we need to be careful whether it's safe to move them.

    // Stores that are otherwise safe to move together may not appear safe

    // to move over one another (i.e. isSafeToMoveBefore may return false).

    Instruction *Base = Move.Main.front();

    if (Base->getParent() != Info.Inst->getParent())

      return false;

    if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(), Move.Main))

      return false;

    Move.Main.push_back(Info.Inst);

    return true;

  };


  MoveList StoreGroups;


  for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {

    const AddrInfo &Info = *I;

    if (!Info.Inst->mayWriteToMemory())

      continue;

    if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))

      StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);

  }


  // Erase singleton groups.

  erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });


  // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).

  if (!HVC.HST.useHVXV62Ops())

    erase_if(StoreGroups, [](const MoveGroup &G) { return G.IsHvx; });


  // Erase groups where every store is a full HVX vector. The reason is that

  // aligning predicated stores generates complex code that may be less

  // efficient than a sequence of unaligned vector stores.

  if (!VADoFullStores) {

    erase_if(StoreGroups, [this](const MoveGroup &G) {

      return G.IsHvx && llvm::all_of(G.Main, [this](Instruction *S) {

               auto MaybeInfo = this->getAddrInfo(*S);

               assert(MaybeInfo.has_value());

               return HVC.HST.isHVXVectorType(

                   EVT::getEVT(MaybeInfo->ValTy, false));

             });

    });

  }


  return StoreGroups;

}


auto AlignVectors::moveTogether(MoveGroup &Move) const -> bool {

  // Move all instructions to be adjacent.

  assert(!Move.Main.empty() && "Move group should have non-empty Main");

  Instruction *Where = Move.Main.front();


  if (Move.IsLoad) {

    // Move all the loads (and dependencies) to where the first load is.

    // Clone all deps to before Where, keeping order.

    Move.Clones = cloneBefore(Where->getIterator(), Move.Deps);

    // Move all main instructions to after Where, keeping order.

    ArrayRef<Instruction *> Main(Move.Main);

    for (Instruction *M : Main) {

      if (M != Where)

        M->moveAfter(Where);

      for (auto [Old, New] : Move.Clones)

        M->replaceUsesOfWith(Old, New);

      Where = M;

    }

    // Replace Deps with the clones.

    for (int i = 0, e = Move.Deps.size(); i != e; ++i)

      Move.Deps[i] = Move.Clones[Move.Deps[i]];

  } else {

    // Move all the stores to where the last store is.

    // NOTE: Deps are empty for "store" groups. If they need to be

    // non-empty, decide on the order.

    assert(Move.Deps.empty());

    // Move all main instructions to before Where, inverting order.

    ArrayRef<Instruction *> Main(Move.Main);

    for (Instruction *M : Main.drop_front(1)) {

      M->moveBefore(Where->getIterator());

      Where = M;

    }

  }


  return Move.Main.size() + Move.Deps.size() > 1;

}


template <typename T>

auto AlignVectors::cloneBefore(BasicBlock::iterator To, T &&Insts) const

    -> InstMap {

  InstMap Map;


  for (Instruction *I : Insts) {

    assert(HVC.isSafeToClone(*I));

    Instruction *C = I->clone();

    C->setName(Twine("c.") + I->getName() + ".");

    C->insertBefore(To);


    for (auto [Old, New] : Map)

      C->replaceUsesOfWith(Old, New);

    Map.insert(std::make_pair(I, C));

  }

  return Map;

}


auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,

                                    const ByteSpan &VSpan, int ScLen,

                                    Value *AlignVal, Value *AlignAddr) const

    -> void {

  LLVM_DEBUG(dbgs() << __func__ << "\n");


  Type *SecTy = HVC.getByteTy(ScLen);

  int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;

  bool DoAlign = !HVC.isZero(AlignVal);

  BasicBlock::iterator BasePos = Builder.GetInsertPoint();

  BasicBlock *BaseBlock = Builder.GetInsertBlock();


  ByteSpan ASpan;

  auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));

  auto *Undef = UndefValue::get(SecTy);


  // Created load does not have to be "Instruction" (e.g. "undef").

  SmallVector<Value *> Loads(NumSectors + DoAlign, nullptr);


  // We could create all of the aligned loads, and generate the valigns

  // at the location of the first load, but for large load groups, this

  // could create highly suboptimal code (there have been groups of 140+

  // loads in real code).

  // Instead, place the loads/valigns as close to the users as possible.

  // In any case we need to have a mapping from the blocks of VSpan (the

  // span covered by the pre-existing loads) to ASpan (the span covered

  // by the aligned loads). There is a small problem, though: ASpan needs

  // to have pointers to the loads/valigns, but we don't have these loads

  // because we don't know where to put them yet. We find out by creating

  // a section of ASpan that corresponds to values (blocks) from VSpan,

  // and checking where the new load should be placed. We need to attach

  // this location information to each block in ASpan somehow, so we put

  // distincts values for Seg.Val in each ASpan.Blocks[i], and use a map

  // to store the location for each Seg.Val.

  // The distinct values happen to be Blocks[i].Seg.Val = &Blocks[i],

  // which helps with printing ByteSpans without crashing when printing

  // Segments with these temporary identifiers in place of Val.


  // Populate the blocks first, to avoid reallocations of the vector

  // interfering with generating the placeholder addresses.

  for (int Index = 0; Index != NumSectors; ++Index)

    ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);

  for (int Index = 0; Index != NumSectors; ++Index) {

    ASpan.Blocks[Index].Seg.Val =

        reinterpret_cast<Value *>(&ASpan.Blocks[Index]);

  }


  // Multiple values from VSpan can map to the same value in ASpan. Since we

  // try to create loads lazily, we need to find the earliest use for each

  // value from ASpan.

  DenseMap<void *, Instruction *> EarliestUser;

  auto isEarlier = [](Instruction *A, Instruction *B) {

    if (B == nullptr)

      return true;

    if (A == nullptr)

      return false;

    assert(A->getParent() == B->getParent());

    return A->comesBefore(B);

  };

  auto earliestUser = [&](const auto &Uses) {

    Instruction *User = nullptr;

    for (const Use &U : Uses) {

      auto *I = dyn_cast<Instruction>(U.getUser());

      assert(I != nullptr && "Load used in a non-instruction?");

      // Make sure we only consider users in this block, but we need

      // to remember if there were users outside the block too. This is

      // because if no users are found, aligned loads will not be created.

      if (I->getParent() == BaseBlock) {

        if (!isa<PHINode>(I))

          User = std::min(User, I, isEarlier);

      } else {

        User = std::min(User, BaseBlock->getTerminator(), isEarlier);

      }

    }

    return User;

  };


  for (const ByteSpan::Block &B : VSpan) {

    ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);

    for (const ByteSpan::Block &S : ASection) {

      auto &EU = EarliestUser[S.Seg.Val];

      EU = std::min(EU, earliestUser(B.Seg.Val->uses()), isEarlier);

    }

  }


  LLVM_DEBUG({

    dbgs() << "ASpan:\n" << ASpan << '\n';

    dbgs() << "Earliest users of ASpan:\n";

    for (auto &[Val, User] : EarliestUser) {

      dbgs() << Val << "\n ->" << *User << '\n';

    }

  });


  auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,

                        int Index, bool MakePred) {

    Value *Ptr =

        createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);

    Value *Predicate =

        MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;


    // If vector shifting is potentially needed, accumulate metadata

    // from source sections of twice the load width.

    int Start = (Index - DoAlign) * ScLen;

    int Width = (1 + DoAlign) * ScLen;

    return this->createLoad(Builder, SecTy, Ptr, Predicate, ScLen, True, Undef,

                            VSpan.section(Start, Width).values());

  };


  auto moveBefore = [this](BasicBlock::iterator In, BasicBlock::iterator To) {

    // Move In and its upward dependencies to before To.

    assert(In->getParent() == To->getParent());

    DepList Deps = getUpwardDeps(&*In, &*To);

    In->moveBefore(To);

    // DepList is sorted with respect to positions in the basic block.

    InstMap Map = cloneBefore(In, Deps);

    for (auto [Old, New] : Map)

      In->replaceUsesOfWith(Old, New);

  };


  // Generate necessary loads at appropriate locations.

  LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");

  for (int Index = 0; Index != NumSectors + 1; ++Index) {

    // In ASpan, each block will be either a single aligned load, or a

    // valign of a pair of loads. In the latter case, an aligned load j

    // will belong to the current valign, and the one in the previous

    // block (for j > 0).

    // Place the load at a location which will dominate the valign, assuming

    // the valign will be placed right before the earliest user.

    Instruction *PrevAt =

        DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;

    Instruction *ThisAt =

        Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;

    if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {

      Builder.SetInsertPoint(Where);

      Loads[Index] =

          createLoad(Builder, VSpan, Index, DoAlign && Index == NumSectors);

      // We know it's safe to put the load at BasePos, but we'd prefer to put

      // it at "Where". To see if the load is safe to be placed at Where, put

      // it there first and then check if it's safe to move it to BasePos.

      // If not, then the load needs to be placed at BasePos.

      // We can't do this check proactively because we need the load to exist

      // in order to check legality.

      if (auto *Load = dyn_cast<Instruction>(Loads[Index])) {

        if (!HVC.isSafeToMoveBeforeInBB(*Load, BasePos))

          moveBefore(Load->getIterator(), BasePos);

      }

      LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');

    }

  }


  // Generate valigns if needed, and fill in proper values in ASpan

  LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");

  for (int Index = 0; Index != NumSectors; ++Index) {

    ASpan[Index].Seg.Val = nullptr;

    if (auto *Where = EarliestUser[&ASpan[Index]]) {

      Builder.SetInsertPoint(Where);

      Value *Val = Loads[Index];

      assert(Val != nullptr);

      if (DoAlign) {

        Value *NextLoad = Loads[Index + 1];

        assert(NextLoad != nullptr);

        Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);

      }

      ASpan[Index].Seg.Val = Val;

      LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');

    }

  }


  for (const ByteSpan::Block &B : VSpan) {

    ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);

    Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));

    Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));


    // We're generating a reduction, where each instruction depends on

    // the previous one, so we need to order them according to the position

    // of their inputs in the code.

    std::vector<ByteSpan::Block *> ABlocks;

    for (ByteSpan::Block &S : ASection) {

      if (S.Seg.Val != nullptr)

        ABlocks.push_back(&S);

    }

    llvm::sort(ABlocks,

               [&](const ByteSpan::Block *A, const ByteSpan::Block *B) {

                 return isEarlier(cast<Instruction>(A->Seg.Val),

                                  cast<Instruction>(B->Seg.Val));

               });

    for (ByteSpan::Block *S : ABlocks) {

      // The processing of the data loaded by the aligned loads

      // needs to be inserted after the data is available.

      Instruction *SegI = cast<Instruction>(S->Seg.Val);

      Builder.SetInsertPoint(&*std::next(SegI->getIterator()));

      Value *Pay = HVC.vbytes(Builder, getPayload(S->Seg.Val));

      Accum =

          HVC.insertb(Builder, Accum, Pay, S->Seg.Start, S->Seg.Size, S->Pos);

    }

    // Instead of casting everything to bytes for the vselect, cast to the

    // original value type. This will avoid complications with casting masks.

    // For example, in cases when the original mask applied to i32, it could

    // be converted to a mask applicable to i8 via pred_typecast intrinsic,

    // but if the mask is not exactly of HVX length, extra handling would be

    // needed to make it work.

    Type *ValTy = getPayload(B.Seg.Val)->getType();

    Value *Cast = Builder.CreateBitCast(Accum, ValTy, "cst");

    Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,

                                      getPassThrough(B.Seg.Val), "sel");

    B.Seg.Val->replaceAllUsesWith(Sel);

  }

}


auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,

                                     const ByteSpan &VSpan, int ScLen,

                                     Value *AlignVal, Value *AlignAddr) const

    -> void {

  LLVM_DEBUG(dbgs() << __func__ << "\n");


  Type *SecTy = HVC.getByteTy(ScLen);

  int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;

  bool DoAlign = !HVC.isZero(AlignVal);


  // Stores.

  ByteSpan ASpanV, ASpanM;


  // Return a vector value corresponding to the input value Val:

  // either <1 x Val> for scalar Val, or Val itself for vector Val.

  auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {

    Type *Ty = Val->getType();

    if (Ty->isVectorTy())

      return Val;

    auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);

    return Builder.CreateBitCast(Val, VecTy, "cst");

  };


  // Create an extra "undef" sector at the beginning and at the end.

  // They will be used as the left/right filler in the vlalign step.

  for (int Index = (DoAlign ? -1 : 0); Index != NumSectors + DoAlign; ++Index) {

    // For stores, the size of each section is an aligned vector length.

    // Adjust the store offsets relative to the section start offset.

    ByteSpan VSection =

        VSpan.section(Index * ScLen, ScLen).shift(-Index * ScLen);

    Value *Undef = UndefValue::get(SecTy);

    Value *Zero = HVC.getNullValue(SecTy);

    Value *AccumV = Undef;

    Value *AccumM = Zero;

    for (ByteSpan::Block &S : VSection) {

      Value *Pay = getPayload(S.Seg.Val);

      Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),

                                Pay->getType(), HVC.getByteTy());

      Value *PartM = HVC.insertb(Builder, Zero, HVC.vbytes(Builder, Mask),

                                 S.Seg.Start, S.Seg.Size, S.Pos);

      AccumM = Builder.CreateOr(AccumM, PartM);


      Value *PartV = HVC.insertb(Builder, Undef, HVC.vbytes(Builder, Pay),

                                 S.Seg.Start, S.Seg.Size, S.Pos);


      AccumV = Builder.CreateSelect(

          Builder.CreateICmp(CmpInst::ICMP_NE, PartM, Zero), PartV, AccumV);

    }

    ASpanV.Blocks.emplace_back(AccumV, ScLen, Index * ScLen);

    ASpanM.Blocks.emplace_back(AccumM, ScLen, Index * ScLen);

  }


  LLVM_DEBUG({

    dbgs() << "ASpanV before vlalign:\n" << ASpanV << '\n';

    dbgs() << "ASpanM before vlalign:\n" << ASpanM << '\n';

  });


  // vlalign

  if (DoAlign) {

    for (int Index = 1; Index != NumSectors + 2; ++Index) {

      Value *PrevV = ASpanV[Index - 1].Seg.Val, *ThisV = ASpanV[Index].Seg.Val;

      Value *PrevM = ASpanM[Index - 1].Seg.Val, *ThisM = ASpanM[Index].Seg.Val;

      assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));

      ASpanV[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);

      ASpanM[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);

    }

  }


  LLVM_DEBUG({

    dbgs() << "ASpanV after vlalign:\n" << ASpanV << '\n';

    dbgs() << "ASpanM after vlalign:\n" << ASpanM << '\n';

  });


  auto createStore = [&](IRBuilderBase &Builder, const ByteSpan &ASpanV,

                         const ByteSpan &ASpanM, int Index, bool MakePred) {

    Value *Val = ASpanV[Index].Seg.Val;

    Value *Mask = ASpanM[Index].Seg.Val; // bytes

    if (HVC.isUndef(Val) || HVC.isZero(Mask))

      return;

    Value *Ptr =

        createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);

    Value *Predicate =

        MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;


    // If vector shifting is potentially needed, accumulate metadata

    // from source sections of twice the store width.

    int Start = (Index - DoAlign) * ScLen;

    int Width = (1 + DoAlign) * ScLen;

    this->createStore(Builder, Val, Ptr, Predicate, ScLen,

                      HVC.vlsb(Builder, Mask),

                      VSpan.section(Start, Width).values());

  };


  for (int Index = 0; Index != NumSectors + DoAlign; ++Index) {

    createStore(Builder, ASpanV, ASpanM, Index, DoAlign && Index == NumSectors);

  }

}


auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {

  LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');


  // TODO: Needs support for masked loads/stores of "scalar" vectors.

  if (!Move.IsHvx)

    return false;


  // Return the element with the maximum alignment from Range,

  // where GetValue obtains the value to compare from an element.

  auto getMaxOf = [](auto Range, auto GetValue) {

    return *llvm::max_element(Range, [&GetValue](auto &A, auto &B) {

      return GetValue(A) < GetValue(B);

    });

  };


  const AddrList &BaseInfos = AddrGroups.at(Move.Base);


  // Conceptually, there is a vector of N bytes covering the addresses

  // starting from the minimum offset (i.e. Base.Addr+Start). This vector

  // represents a contiguous memory region that spans all accessed memory

  // locations.

  // The correspondence between loaded or stored values will be expressed

  // in terms of this vector. For example, the 0th element of the vector

  // from the Base address info will start at byte Start from the beginning

  // of this conceptual vector.

  //

  // This vector will be loaded/stored starting at the nearest down-aligned

  // address and the amount od the down-alignment will be AlignVal:

  //   valign(load_vector(align_down(Base+Start)), AlignVal)


  std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());

  AddrList MoveInfos;

  llvm::copy_if(

      BaseInfos, std::back_inserter(MoveInfos),

      [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });


  // Maximum alignment present in the whole address group.

  const AddrInfo &WithMaxAlign =

      getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });

  Align MaxGiven = WithMaxAlign.HaveAlign;


  // Minimum alignment present in the move address group.

  const AddrInfo &WithMinOffset =

      getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });


  const AddrInfo &WithMaxNeeded =

      getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });

  Align MinNeeded = WithMaxNeeded.NeedAlign;


  // Set the builder's insertion point right before the load group, or

  // immediately after the store group. (Instructions in a store group are

  // listed in reverse order.)

  Instruction *InsertAt = Move.Main.front();

  if (!Move.IsLoad) {

    // There should be a terminator (which store isn't, but check anyways).

    assert(InsertAt->getIterator() != InsertAt->getParent()->end());

    InsertAt = &*std::next(InsertAt->getIterator());

  }


  IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),

                    InstSimplifyFolder(HVC.DL));

  Value *AlignAddr = nullptr; // Actual aligned address.

  Value *AlignVal = nullptr;  // Right-shift amount (for valign).


  if (MinNeeded <= MaxGiven) {

    int Start = WithMinOffset.Offset;

    int OffAtMax = WithMaxAlign.Offset;

    // Shift the offset of the maximally aligned instruction (OffAtMax)

    // back by just enough multiples of the required alignment to cover the

    // distance from Start to OffAtMax.

    // Calculate the address adjustment amount based on the address with the

    // maximum alignment. This is to allow a simple gep instruction instead

    // of potential bitcasts to i8*.

    int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());

    AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,

                                      WithMaxAlign.ValTy, Adjust, Move.Clones);

    int Diff = Start - (OffAtMax + Adjust);

    AlignVal = HVC.getConstInt(Diff);

    assert(Diff >= 0);

    assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());

  } else {

    // WithMinOffset is the lowest address in the group,

    //   WithMinOffset.Addr = Base+Start.

    // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)

    // mask off unnecessary bits, so it's ok to just the original pointer as

    // the alignment amount.

    // Do an explicit down-alignment of the address to avoid creating an

    // aligned instruction with an address that is not really aligned.

    AlignAddr =

        createAlignedPointer(Builder, WithMinOffset.Addr, WithMinOffset.ValTy,

                             MinNeeded.value(), Move.Clones);

    AlignVal =

        Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy(), "pti");

    if (auto *I = dyn_cast<Instruction>(AlignVal)) {

      for (auto [Old, New] : Move.Clones)

        I->replaceUsesOfWith(Old, New);

    }

  }


  ByteSpan VSpan;

  for (const AddrInfo &AI : MoveInfos) {

    VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),

                              AI.Offset - WithMinOffset.Offset);

  }


  // The aligned loads/stores will use blocks that are either scalars,

  // or HVX vectors. Let "sector" be the unified term for such a block.

  // blend(scalar, vector) -> sector...

  int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()

                         : std::max<int>(MinNeeded.value(), 4);

  assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);

  assert(Move.IsHvx || ScLen == 4 || ScLen == 8);


  LLVM_DEBUG({

    dbgs() << "ScLen:  " << ScLen << "\n";

    dbgs() << "AlignVal:" << *AlignVal << "\n";

    dbgs() << "AlignAddr:" << *AlignAddr << "\n";

    dbgs() << "VSpan:\n" << VSpan << '\n';

  });


  if (Move.IsLoad)

    realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);

  else

    realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);


  for (auto *Inst : Move.Main)

    Inst->eraseFromParent();


  return true;

}


auto AlignVectors::makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,

                                       int Alignment) const -> Value * {

  auto *AlignTy = AlignVal->getType();

  Value *And = Builder.CreateAnd(

      AlignVal, ConstantInt::get(AlignTy, Alignment - 1), "and");

  Value *Zero = ConstantInt::get(AlignTy, 0);

  return Builder.CreateICmpNE(And, Zero, "isz");

}


auto AlignVectors::isSectorTy(Type *Ty) const -> bool {

  if (!HVC.isByteVecTy(Ty))

    return false;

  int Size = HVC.getSizeOf(Ty);

  if (HVC.HST.isTypeForHVX(Ty))

    return Size == static_cast<int>(HVC.HST.getVectorLength());

  return Size == 4 || Size == 8;

}


auto AlignVectors::run() -> bool {

  LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()

                    << '\n');

  if (!createAddressGroups())

    return false;


  LLVM_DEBUG({

    dbgs() << "Address groups(" << AddrGroups.size() << "):\n";

    for (auto &[In, AL] : AddrGroups) {

      for (const AddrInfo &AI : AL)

        dbgs() << "---\n" << AI << '\n';

    }

  });


  bool Changed = false;

  MoveList LoadGroups, StoreGroups;


  for (auto &G : AddrGroups) {

    llvm::append_range(LoadGroups, createLoadGroups(G.second));

    llvm::append_range(StoreGroups, createStoreGroups(G.second));

  }


  LLVM_DEBUG({

    dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";

    for (const MoveGroup &G : LoadGroups)

      dbgs() << G << "\n";

    dbgs() << "Store groups(" << StoreGroups.size() << "):\n";

    for (const MoveGroup &G : StoreGroups)

      dbgs() << G << "\n";

  });


  // Cumulative limit on the number of groups.

  unsigned CountLimit = VAGroupCountLimit;

  if (CountLimit == 0)

    return false;


  if (LoadGroups.size() > CountLimit) {

    LoadGroups.resize(CountLimit);

    StoreGroups.clear();

  } else {

    unsigned StoreLimit = CountLimit - LoadGroups.size();

    if (StoreGroups.size() > StoreLimit)

      StoreGroups.resize(StoreLimit);

  }


  for (auto &M : LoadGroups)

    Changed |= moveTogether(M);

  for (auto &M : StoreGroups)

    Changed |= moveTogether(M);


  LLVM_DEBUG(dbgs() << "After moveTogether:\n" << HVC.F);


  for (auto &M : LoadGroups)

    Changed |= realignGroup(M);

  for (auto &M : StoreGroups)

    Changed |= realignGroup(M);


  return Changed;

}


// --- End AlignVectors


// --- Begin HvxIdioms


auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const

    -> std::pair<unsigned, Signedness> {

  unsigned Bits = HVC.getNumSignificantBits(V, In);

  // The significant bits are calculated including the sign bit. This may

  // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may

  // result in 33 significant bits. To avoid extra words, skip the extra

  // sign bit, but keep information that the value is to be treated as

  // unsigned.

  KnownBits Known = HVC.getKnownBits(V, In);

  Signedness Sign = Signed;

  unsigned NumToTest = 0; // Number of bits used in test for unsignedness.

  if (isPowerOf2_32(Bits))

    NumToTest = Bits;

  else if (Bits > 1 && isPowerOf2_32(Bits - 1))

    NumToTest = Bits - 1;


  if (NumToTest != 0 && Known.Zero.ashr(NumToTest).isAllOnes()) {

    Sign = Unsigned;

    Bits = NumToTest;

  }


  // If the top bit of the nearest power-of-2 is zero, this value is

  // positive. It could be treated as either signed or unsigned.

  if (unsigned Pow2 = PowerOf2Ceil(Bits); Pow2 != Bits) {

    if (Known.Zero.ashr(Pow2 - 1).isAllOnes())

      Sign = Positive;

  }

  return {Bits, Sign};

}


auto HvxIdioms::canonSgn(SValue X, SValue Y) const

    -> std::pair<SValue, SValue> {

  // Canonicalize the signedness of X and Y, so that the result is one of:

  //   S, S

  //   U/P, S

  //   U/P, U/P

  if (X.Sgn == Signed && Y.Sgn != Signed)

    std::swap(X, Y);

  return {X, Y};

}


// Match

//   (X * Y) [>> N], or

//   ((X * Y) + (1 << M)) >> N

auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {

  using namespace PatternMatch;

  auto *Ty = In.getType();


  if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())

    return std::nullopt;


  unsigned Width = cast<IntegerType>(Ty->getScalarType())->getBitWidth();


  FxpOp Op;

  Value *Exp = &In;


  // Fixed-point multiplication is always shifted right (except when the

  // fraction is 0 bits).

  auto m_Shr = [](auto &&V, auto &&S) {

    return m_CombineOr(m_LShr(V, S), m_AShr(V, S));

  };


  uint64_t Qn = 0;

  if (Value *T; match(Exp, m_Shr(m_Value(T), m_ConstantInt(Qn)))) {

    Op.Frac = Qn;

    Exp = T;

  } else {

    Op.Frac = 0;

  }


  if (Op.Frac > Width)

    return std::nullopt;


  // Check if there is rounding added.

  uint64_t CV;

  if (Value *T;

      Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_ConstantInt(CV)))) {

    if (CV != 0 && !isPowerOf2_64(CV))

      return std::nullopt;

    if (CV != 0)

      Op.RoundAt = Log2_64(CV);

    Exp = T;

  }


  // Check if the rest is a multiplication.

  if (match(Exp, m_Mul(m_Value(Op.X.Val), m_Value(Op.Y.Val)))) {

    Op.Opcode = Instruction::Mul;

    // FIXME: The information below is recomputed.

    Op.X.Sgn = getNumSignificantBits(Op.X.Val, &In).second;

    Op.Y.Sgn = getNumSignificantBits(Op.Y.Val, &In).second;

    Op.ResTy = cast<VectorType>(Ty);

    return Op;

  }


  return std::nullopt;

}


auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const

    -> Value * {

  assert(Op.X.Val->getType() == Op.Y.Val->getType());


  auto *VecTy = dyn_cast<VectorType>(Op.X.Val->getType());

  if (VecTy == nullptr)

    return nullptr;

  auto *ElemTy = cast<IntegerType>(VecTy->getElementType());

  unsigned ElemWidth = ElemTy->getBitWidth();


  // TODO: This can be relaxed after legalization is done pre-isel.

  if ((HVC.length(VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)

    return nullptr;


  // There are no special intrinsics that should be used for multiplying

  // signed 8-bit values, so just skip them. Normal codegen should handle

  // this just fine.

  if (ElemWidth <= 8)

    return nullptr;

  // Similarly, if this is just a multiplication that can be handled without

  // intervention, then leave it alone.

  if (ElemWidth <= 32 && Op.Frac == 0)

    return nullptr;


  auto [BitsX, SignX] = getNumSignificantBits(Op.X.Val, &In);

  auto [BitsY, SignY] = getNumSignificantBits(Op.Y.Val, &In);


  // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).


  Value *X = Op.X.Val, *Y = Op.Y.Val;

  IRBuilder Builder(In.getParent(), In.getIterator(),

                    InstSimplifyFolder(HVC.DL));


  auto roundUpWidth = [](unsigned Width) -> unsigned {

    if (Width <= 32 && !isPowerOf2_32(Width)) {

      // If the element width is not a power of 2, round it up

      // to the next one. Do this for widths not exceeding 32.

      return PowerOf2Ceil(Width);

    }

    if (Width > 32 && Width % 32 != 0) {

      // For wider elements, round it up to the multiple of 32.

      return alignTo(Width, 32u);

    }

    return Width;

  };


  BitsX = roundUpWidth(BitsX);

  BitsY = roundUpWidth(BitsY);


  // For elementwise multiplication vectors must have the same lengths, so

  // resize the elements of both inputs to the same width, the max of the

  // calculated significant bits.

  unsigned Width = std::max(BitsX, BitsY);


  auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);

  if (Width < ElemWidth) {

    X = Builder.CreateTrunc(X, ResizeTy, "trn");

    Y = Builder.CreateTrunc(Y, ResizeTy, "trn");

  } else if (Width > ElemWidth) {

    X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy, "sxt")

                        : Builder.CreateZExt(X, ResizeTy, "zxt");

    Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy, "sxt")

                        : Builder.CreateZExt(Y, ResizeTy, "zxt");

  };


  assert(X->getType() == Y->getType() && X->getType() == ResizeTy);


  unsigned VecLen = HVC.length(ResizeTy);

  unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(Width, 32u);


  SmallVector<Value *> Results;

  FxpOp ChopOp = Op;

  ChopOp.ResTy = VectorType::get(Op.ResTy->getElementType(), ChopLen, false);


  for (unsigned V = 0; V != VecLen / ChopLen; ++V) {

    ChopOp.X.Val = HVC.subvector(Builder, X, V * ChopLen, ChopLen);

    ChopOp.Y.Val = HVC.subvector(Builder, Y, V * ChopLen, ChopLen);

    Results.push_back(processFxpMulChopped(Builder, In, ChopOp));

    if (Results.back() == nullptr)

      break;

  }


  if (Results.empty() || Results.back() == nullptr)

    return nullptr;


  Value *Cat = HVC.concat(Builder, Results);

  Value *Ext = SignX == Signed || SignY == Signed

                   ? Builder.CreateSExt(Cat, VecTy, "sxt")

                   : Builder.CreateZExt(Cat, VecTy, "zxt");

  return Ext;

}


inline bool HvxIdioms::matchScatter(Instruction &In) const {

  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);

  if (!II)

    return false;

  return (II->getIntrinsicID() == Intrinsic::masked_scatter);

}


inline bool HvxIdioms::matchGather(Instruction &In) const {

  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);

  if (!II)

    return false;

  return (II->getIntrinsicID() == Intrinsic::masked_gather);

}


Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);


// Binary instructions we want to handle as users of gather/scatter.


inline bool isArithmetic(unsigned Opc) {

  switch (Opc) {

  case Instruction::Add:

  case Instruction::Sub:

  case Instruction::Mul:

  case Instruction::And:

  case Instruction::Or:

  case Instruction::Xor:

  case Instruction::AShr:

  case Instruction::LShr:

  case Instruction::Shl:

  case Instruction::UDiv:

    return true;

  }

  return false;

}


// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.


inline Value *getPointer(Value *Ptr) {

  assert(Ptr && "Unable to extract pointer");

  if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr))

    return Ptr;

  if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr))

    return getLoadStorePointerOperand(Ptr);

  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) {

    if (II->getIntrinsicID() == Intrinsic::masked_store)

      return II->getOperand(1);

  }

  return nullptr;

}


static Instruction *selectDestination(Instruction *In,

                                      HvxIdioms::DstQualifier &Qual) {

  Instruction *Destination = nullptr;

  if (!In)

    return Destination;

  if (isa<StoreInst>(In)) {

    Destination = In;

    Qual = HvxIdioms::LdSt;

  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {

    if (II->getIntrinsicID() == Intrinsic::masked_gather) {

      Destination = In;

      Qual = HvxIdioms::LLVM_Gather;

    } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {

      Destination = In;

      Qual = HvxIdioms::LLVM_Scatter;

    } else if (II->getIntrinsicID() == Intrinsic::masked_store) {

      Destination = In;

      Qual = HvxIdioms::LdSt;

    } else if (II->getIntrinsicID() ==

               Intrinsic::hexagon_V6_vgather_vscattermh) {

      Destination = In;

      Qual = HvxIdioms::HEX_Gather_Scatter;

    } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {

      Destination = In;

      Qual = HvxIdioms::HEX_Scatter;

    } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {

      Destination = In;

      Qual = HvxIdioms::HEX_Gather;

    }

  } else if (isa<ZExtInst>(In)) {

    return locateDestination(In, Qual);

  } else if (isa<CastInst>(In)) {

    return locateDestination(In, Qual);

  } else if (isa<CallInst>(In)) {

    Destination = In;

    Qual = HvxIdioms::Call;

  } else if (isa<GetElementPtrInst>(In)) {

    return locateDestination(In, Qual);

  } else if (isArithmetic(In->getOpcode())) {

    Destination = In;

    Qual = HvxIdioms::Arithmetic;

  } else {

    LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");

  }

  return Destination;

}


// This method attempts to find destination (user) for a given intrinsic.

// Given that these are produced only by Ripple, the number of options is

// limited. Simplest case is explicit store which in fact is redundant (since

// HVX gater creates its own store during packetization). Nevertheless we need

// to figure address where we storing. Other cases are more complicated, but

// still few.


Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {

  Instruction *Destination = nullptr;

  if (!In)

    return Destination;

  // Get all possible destinations

  SmallVector<Instruction *> Users;

  // Iterate over the uses of the instruction

  for (auto &U : In->uses()) {

    if (auto *UI = dyn_cast<Instruction>(U.getUser())) {

      Destination = selectDestination(UI, Qual);

      if (Destination)

        Users.push_back(Destination);

    }

  }

  // Now see which of the users (if any) is a memory destination.

  for (auto *I : Users)

    if (getPointer(I))

      return I;

  return Destination;

}


// The two intrinsics we handle here have GEP in a different position.


inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) {

  assert(In && "Bad instruction");

  IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(In);

  assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||

                  IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&

         "Not a gather Intrinsic");

  GetElementPtrInst *GEPIndex = nullptr;

  if (IIn->getIntrinsicID() == Intrinsic::masked_gather)

    GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0));

  else

    GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1));

  return GEPIndex;

}


// Given the intrinsic find its GEP argument and extract base address it uses.

// The method relies on the way how Ripple typically forms the GEP for

// scatter/gather.


static Value *locateAddressFromIntrinsic(Instruction *In) {

  GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);

  if (!GEPIndex) {

    LLVM_DEBUG(dbgs() << "  No GEP in intrinsic\n");

    return nullptr;

  }

  Value *BaseAddress = GEPIndex->getPointerOperand();

  auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress);

  if (IndexLoad)

    return IndexLoad;


  auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress);

  if (IndexZEx) {

    IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0));

    if (IndexLoad)

      return IndexLoad;

    IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0));

    if (II && II->getIntrinsicID() == Intrinsic::masked_gather)

      return locateAddressFromIntrinsic(II);

  }

  auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress);

  if (BaseShuffle) {

    IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0));

    if (IndexLoad)

      return IndexLoad;

    auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0));

    if (IE) {

      auto *Src = IE->getOperand(1);

      IndexLoad = dyn_cast<LoadInst>(Src);

      if (IndexLoad)

        return IndexLoad;

      auto *Alloca = dyn_cast<AllocaInst>(Src);

      if (Alloca)

        return Alloca;

      if (isa<Argument>(Src)) {

        return Src;

      }

      if (isa<GlobalValue>(Src)) {

        return Src;

      }

    }

  }

  LLVM_DEBUG(dbgs() << "  Unable to locate Address from intrinsic\n");

  return nullptr;

}


static Type *getIndexType(Value *In) {

  if (!In)

    return nullptr;


  if (isa<LoadInst>(In) || isa<StoreInst>(In))

    return getLoadStoreType(In);


  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {

    if (II->getIntrinsicID() == Intrinsic::masked_load)

      return II->getType();

    if (II->getIntrinsicID() == Intrinsic::masked_store)

      return II->getOperand(0)->getType();

  }

  return In->getType();

}


static Value *locateIndexesFromGEP(Value *In) {

  if (!In)

    return nullptr;

  if (isa<LoadInst>(In))

    return In;

  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {

    if (II->getIntrinsicID() == Intrinsic::masked_load)

      return In;

    if (II->getIntrinsicID() == Intrinsic::masked_gather)

      return In;

  }

  if (auto *IndexZEx = dyn_cast<ZExtInst>(In))

    return locateIndexesFromGEP(IndexZEx->getOperand(0));

  if (auto *IndexSEx = dyn_cast<SExtInst>(In))

    return locateIndexesFromGEP(IndexSEx->getOperand(0));

  if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))

    return locateIndexesFromGEP(BaseShuffle->getOperand(0));

  if (auto *IE = dyn_cast<InsertElementInst>(In))

    return locateIndexesFromGEP(IE->getOperand(1));

  if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In))

    return cstDataVector;

  if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))

    return GEPIndex->getOperand(0);

  return nullptr;

}


// Given the intrinsic find its GEP argument and extract offsetts from the base

// address it uses.


static Value *locateIndexesFromIntrinsic(Instruction *In) {

  GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);

  if (!GEPIndex) {

    LLVM_DEBUG(dbgs() << "  No GEP in intrinsic\n");

    return nullptr;

  }

  Value *Indexes = GEPIndex->getOperand(1);

  if (auto *IndexLoad = locateIndexesFromGEP(Indexes))

    return IndexLoad;


  LLVM_DEBUG(dbgs() << "  Unable to locate Index from intrinsic\n");

  return nullptr;

}


// Because of aukward definition of many Hex intrinsics we often have to

// reinterprete HVX native <64 x i16> as <32 x i32> which in practice is a NOP

// for all use cases, so this only exist to make IR builder happy.


inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,

                                               IRBuilderBase &Builder,

                                               LLVMContext &Ctx, Value *I) {

  assert(I && "Unable to reinterprete cast");

  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);

  std::vector<unsigned> shuffleMask;

  for (unsigned i = 0; i < 64; ++i)

    shuffleMask.push_back(i);

  Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);

  Value *CastShuffle =

      Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");

  return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32");

}


// Recast <128 x i8> as <32 x i32>


inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,

                                              IRBuilderBase &Builder,

                                              LLVMContext &Ctx, Value *I) {

  assert(I && "Unable to reinterprete cast");

  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);

  std::vector<unsigned> shuffleMask;

  for (unsigned i = 0; i < 128; ++i)

    shuffleMask.push_back(i);

  Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);

  Value *CastShuffle =

      Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");

  return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32");

}


// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern


inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,

                           IRBuilderBase &Builder, LLVMContext &Ctx,

                           unsigned int pattern) {

  std::vector<unsigned int> byteMask;

  for (unsigned i = 0; i < 32; ++i)

    byteMask.push_back(pattern);


  return Builder.CreateIntrinsic(

      HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt),

      {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)},

      nullptr);

}


Value *HvxIdioms::processVScatter(Instruction &In) const {

  auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());

  assert(InpTy && "Cannot handle no vector type for llvm.scatter/gather");

  unsigned InpSize = HVC.getSizeOf(InpTy);

  auto *F = In.getFunction();

  LLVMContext &Ctx = F->getContext();

  auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType());

  assert(ElemTy && "llvm.scatter needs integer type argument");

  unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy);

  LLVM_DEBUG({

    unsigned Elements = HVC.length(InpTy);

    dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";

    dbgs() << "  Input type(" << *InpTy << ") elements(" << Elements

           << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("

           << ElemWidth << ")\n";

  });


  IRBuilder Builder(In.getParent(), In.getIterator(),

                    InstSimplifyFolder(HVC.DL));


  auto *ValueToScatter = In.getOperand(0);

  LLVM_DEBUG(dbgs() << "  ValueToScatter   : " << *ValueToScatter << "\n");


  if (HVC.HST.getVectorLength() != InpSize) {

    LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize

                      << ") for vscatter\n");

    return nullptr;

  }


  // Base address of indexes.

  auto *IndexLoad = locateAddressFromIntrinsic(&In);

  if (!IndexLoad)

    return nullptr;

  LLVM_DEBUG(dbgs() << "  IndexLoad        : " << *IndexLoad << "\n");


  // Address of destination. Must be in VTCM.

  auto *Ptr = getPointer(IndexLoad);

  if (!Ptr)

    return nullptr;

  LLVM_DEBUG(dbgs() << "  Ptr              : " << *Ptr << "\n");

  // Indexes/offsets

  auto *Indexes = locateIndexesFromIntrinsic(&In);

  if (!Indexes)

    return nullptr;

  LLVM_DEBUG(dbgs() << "  Indexes          : " << *Indexes << "\n");

  Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx),

                                                    "cst_ptr_to_i32");

  LLVM_DEBUG(dbgs() << "  CastedDst        : " << *CastedDst << "\n");

  // Adjust Indexes

  auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);

  Value *CastIndex = nullptr;

  if (cstDataVector) {

    // Our indexes are represented as a constant. We need it in a reg.

    AllocaInst *IndexesAlloca =

        Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false));

    [[maybe_unused]] auto *StoreIndexes =

        Builder.CreateStore(cstDataVector, IndexesAlloca);

    LLVM_DEBUG(dbgs() << "  StoreIndexes     : " << *StoreIndexes << "\n");

    CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(),

                                   IndexesAlloca, "reload_index");

  } else {

    if (ElemWidth == 2)

      CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);

    else

      CastIndex = Indexes;

  }

  LLVM_DEBUG(dbgs() << "  Cast index       : " << *CastIndex << ")\n");


  if (ElemWidth == 1) {

    // v128i8 There is no native instruction for this.

    // Do this as two Hi/Lo gathers with masking.

    Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);

    // Extend indexes. We assume that indexes are in 128i8 format - need to

    // expand them to Hi/Lo 64i16

    Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32");

    auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);

    auto *UnpackedIndexes = Builder.CreateIntrinsic(

        HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr);

    LLVM_DEBUG(dbgs() << "  UnpackedIndexes  : " << *UnpackedIndexes << ")\n");


    auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);

    auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);

    [[maybe_unused]] Value *IndexHi =

        HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);

    [[maybe_unused]] Value *IndexLo =

        HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);

    LLVM_DEBUG(dbgs() << "  UnpackedIndHi    : " << *IndexHi << ")\n");

    LLVM_DEBUG(dbgs() << "  UnpackedIndLo    : " << *IndexLo << ")\n");

    // Now unpack values to scatter

    Value *CastSrc =

        getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter);

    LLVM_DEBUG(dbgs() << "  CastSrc          : " << *CastSrc << ")\n");

    auto *UnpackedValueToScatter = Builder.CreateIntrinsic(

        HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr);

    LLVM_DEBUG(dbgs() << "  UnpackedValToScat: " << *UnpackedValueToScatter

                      << ")\n");


    [[maybe_unused]] Value *UVSHi =

        HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter);

    [[maybe_unused]] Value *UVSLo =

        HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter);

    LLVM_DEBUG(dbgs() << "  UVSHi            : " << *UVSHi << ")\n");

    LLVM_DEBUG(dbgs() << "  UVSLo            : " << *UVSLo << ")\n");


    // Create the mask for individual bytes

    auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);

    LLVM_DEBUG(dbgs() << "  QByteMask        : " << *QByteMask << "\n");

    [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(

        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,

        {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),

         IndexHi, UVSHi},

        nullptr);

    LLVM_DEBUG(dbgs() << "  ResHi            : " << *ResHi << ")\n");

    return Builder.CreateIntrinsic(

        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,

        {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),

         IndexLo, UVSLo},

        nullptr);

  } else if (ElemWidth == 2) {

    Value *CastSrc =

        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter);

    LLVM_DEBUG(dbgs() << "  CastSrc        : " << *CastSrc << ")\n");

    return Builder.CreateIntrinsic(

        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B,

        {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,

         CastSrc},

        nullptr);

  } else if (ElemWidth == 4) {

    return Builder.CreateIntrinsic(

        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B,

        {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,

         ValueToScatter},

        nullptr);

  } else {

    LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");

    return nullptr;

  }

}


Value *HvxIdioms::processVGather(Instruction &In) const {

  [[maybe_unused]] auto *InpTy =

      dyn_cast<VectorType>(In.getOperand(0)->getType());

  assert(InpTy && "Cannot handle no vector type for llvm.gather");

  [[maybe_unused]] auto *ElemTy =

      dyn_cast<PointerType>(InpTy->getElementType());

  assert(ElemTy && "llvm.gather needs vector of ptr argument");

  auto *F = In.getFunction();

  LLVMContext &Ctx = F->getContext();

  LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"

                    << *In.getParent() << "\n");

  LLVM_DEBUG(dbgs() << "  Input type(" << *InpTy << ") elements("

                    << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)

                    << ") type(" << *ElemTy << ") Access alignment("

                    << *In.getOperand(1) << ") AddressSpace("

                    << ElemTy->getAddressSpace() << ")\n");


  // TODO: Handle masking of elements.

  assert(dyn_cast<VectorType>(In.getOperand(2)->getType()) &&

         "llvm.gather needs vector for mask");

  IRBuilder Builder(In.getParent(), In.getIterator(),

                    InstSimplifyFolder(HVC.DL));


  // See who is using the result. The difference between LLVM and HVX vgather

  // Intrinsic makes it impossible to handle all cases with temp storage. Alloca

  // in VTCM is not yet supported, so for now we just bail out for those cases.

  HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;

  Instruction *Dst = locateDestination(&In, Qual);

  if (!Dst) {

    LLVM_DEBUG(dbgs() << "  Unable to locate vgather destination\n");

    return nullptr;

  }

  LLVM_DEBUG(dbgs() << "  Destination    : " << *Dst << " Qual(" << Qual

                    << ")\n");


  // Address of destination. Must be in VTCM.

  auto *Ptr = getPointer(Dst);

  if (!Ptr) {

    LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");

    return nullptr;

  }


  // Result type. Assume it is a vector type.

  auto *DstType = cast<VectorType>(getIndexType(Dst));

  assert(DstType && "Cannot handle non vector dst type for llvm.gather");


  // Base address for sources to be loaded

  auto *IndexLoad = locateAddressFromIntrinsic(&In);

  if (!IndexLoad)

    return nullptr;

  LLVM_DEBUG(dbgs() << "  IndexLoad      : " << *IndexLoad << "\n");


  // Gather indexes/offsets

  auto *Indexes = locateIndexesFromIntrinsic(&In);

  if (!Indexes)

    return nullptr;

  LLVM_DEBUG(dbgs() << "  Indexes        : " << *Indexes << "\n");


  Instruction *Gather = nullptr;

  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);

  if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {

    // We fully assume the address space is in VTCM. We also assume that all

    // pointers in Operand(0) have the same base(!).

    // This is the most basic case of all the above.

    unsigned OutputSize = HVC.getSizeOf(DstType);

    auto *DstElemTy = cast<IntegerType>(DstType->getElementType());

    unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy);

    LLVM_DEBUG(dbgs() << "  Buffer type    : " << *Ptr->getType()

                      << "  Address space ("

                      << Ptr->getType()->getPointerAddressSpace() << ")\n"

                      << "  Result type    : " << *DstType

                      << "\n  Size in bytes  : " << OutputSize

                      << " element type(" << *DstElemTy

                      << ")\n  ElemWidth      : " << ElemWidth << " bytes\n");


    auto *IndexType = cast<VectorType>(getIndexType(Indexes));

    assert(IndexType && "Cannot handle non vector index type for llvm.gather");

    unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType());

    LLVM_DEBUG(dbgs() << "  IndexWidth(" << IndexWidth << ")\n");


    // Intrinsic takes i32 instead of pointer so cast.

    Value *CastedPtr = Builder.CreateBitOrPointerCast(

        IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");

    // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]

    // int_hexagon_V6_vgathermh       [... , llvm_v16i32_ty]

    // int_hexagon_V6_vgathermh_128B  [... , llvm_v32i32_ty]

    // int_hexagon_V6_vgathermhw      [... , llvm_v32i32_ty]

    // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]

    // int_hexagon_V6_vgathermw       [... , llvm_v16i32_ty]

    // int_hexagon_V6_vgathermw_128B  [... , llvm_v32i32_ty]

    if (HVC.HST.getVectorLength() == OutputSize) {

      if (ElemWidth == 1) {

        // v128i8 There is no native instruction for this.

        // Do this as two Hi/Lo gathers with masking.

        // Unpack indexes. We assume that indexes are in 128i8 format - need to

        // expand them to Hi/Lo 64i16

        Value *CastIndexes =

            Builder.CreateBitCast(Indexes, NT, "cast_to_32i32");

        auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);

        auto *UnpackedIndexes =

            Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true),

                                    V6_vunpack, CastIndexes, nullptr);

        LLVM_DEBUG(dbgs() << "  UnpackedIndexes : " << *UnpackedIndexes

                          << ")\n");


        auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);

        auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);

        [[maybe_unused]] Value *IndexHi =

            HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);

        [[maybe_unused]] Value *IndexLo =

            HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);

        LLVM_DEBUG(dbgs() << "  UnpackedIndHi   : " << *IndexHi << ")\n");

        LLVM_DEBUG(dbgs() << "  UnpackedIndLo   : " << *IndexLo << ")\n");

        // Create the mask for individual bytes

        auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);

        LLVM_DEBUG(dbgs() << "  QByteMask       : " << *QByteMask << "\n");

        // We use our destination allocation as a temp storage

        // This is unlikely to work properly for masked gather.

        auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq);

        [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(

            Type::getVoidTy(Ctx), V6_vgather,

            {Ptr, QByteMask, CastedPtr,

             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},

            nullptr);

        LLVM_DEBUG(dbgs() << "  GatherHi        : " << *GatherHi << ")\n");

        // Rematerialize the result

        [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(

            HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi");

        LLVM_DEBUG(dbgs() << "  LoadedResultHi : " << *LoadedResultHi << "\n");

        // Same for the low part. Here we use Gather to return non-NULL result

        // from this function and continue to iterate. We also are deleting Dst

        // store below.

        Gather = Builder.CreateIntrinsic(

            Type::getVoidTy(Ctx), V6_vgather,

            {Ptr, QByteMask, CastedPtr,

             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},

            nullptr);

        LLVM_DEBUG(dbgs() << "  GatherLo        : " << *Gather << ")\n");

        Value *LoadedResultLo = Builder.CreateLoad(

            HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo");

        LLVM_DEBUG(dbgs() << "  LoadedResultLo : " << *LoadedResultLo << "\n");

        // Now we have properly sized bytes in every other position

        // B b A a c a A b B c f F g G h H is presented as

        // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H

        // Use vpack to gather them

        auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb);

        [[maybe_unused]] auto Res = Builder.CreateIntrinsic(

            NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr);

        LLVM_DEBUG(dbgs() << "  ScaledRes      : " << *Res << "\n");

        [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr);

        LLVM_DEBUG(dbgs() << "  StoreRes       : " << *StoreRes << "\n");

      } else if (ElemWidth == 2) {

        // v32i16

        if (IndexWidth == 2) {

          // Reinterprete 64i16 as 32i32. Only needed for syntactic IR match.

          Value *CastIndex =

              getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);

          LLVM_DEBUG(dbgs() << "  Cast index: " << *CastIndex << ")\n");

          // shift all i16 left by 1 to match short addressing mode instead of

          // byte.

          auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);

          Value *AdjustedIndex = HVC.createHvxIntrinsic(

              Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});

          LLVM_DEBUG(dbgs()

                     << "  Shifted half index: " << *AdjustedIndex << ")\n");


          auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh);

          // The 3rd argument is the size of the region to gather from. Probably

          // want to set it to max VTCM size.

          Gather = Builder.CreateIntrinsic(

              Type::getVoidTy(Ctx), V6_vgather,

              {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),

               AdjustedIndex},

              nullptr);

          for (auto &U : Dst->uses()) {

            if (auto *UI = dyn_cast<Instruction>(U.getUser()))

              dbgs() << "    dst used by: " << *UI << "\n";

          }

          for (auto &U : In.uses()) {

            if (auto *UI = dyn_cast<Instruction>(U.getUser()))

              dbgs() << "    In used by : " << *UI << "\n";

          }

          // Create temp load from result in case the result is used by any

          // other instruction.

          Value *LoadedResult = Builder.CreateLoad(

              HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result");

          LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n");

          In.replaceAllUsesWith(LoadedResult);

        } else {

          dbgs() << "    Unhandled index type for vgather\n";

          return nullptr;

        }

      } else if (ElemWidth == 4) {

        if (IndexWidth == 4) {

          // v32i32

          auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);

          Value *AdjustedIndex = HVC.createHvxIntrinsic(

              Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)});

          LLVM_DEBUG(dbgs()

                     << "  Shifted word index: " << *AdjustedIndex << ")\n");

          Gather = Builder.CreateIntrinsic(

              Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B,

              {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),

               AdjustedIndex},

              nullptr);

        } else {

          LLVM_DEBUG(dbgs() << "    Unhandled index type for vgather\n");

          return nullptr;

        }

      } else {

        LLVM_DEBUG(dbgs() << "    Unhandled element type for vgather\n");

        return nullptr;

      }

    } else if (HVC.HST.getVectorLength() == OutputSize * 2) {

      // This is half of the reg width, duplicate low in high

      LLVM_DEBUG(dbgs() << "    Unhandled half of register size\n");

      return nullptr;

    } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {

      LLVM_DEBUG(dbgs() << "    Unhandle twice the register size\n");

      return nullptr;

    }

    // Erase the original intrinsic and store that consumes it.

    // HVX will create a pseudo for gather that is expanded to gather + store

    // during packetization.

    Dst->eraseFromParent();

  } else if (Qual == HvxIdioms::LLVM_Scatter) {

    // Gather feeds directly into scatter.

    auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType());

    assert(DstInpTy && "Cannot handle no vector type for llvm.scatter");

    [[maybe_unused]] unsigned DstInpSize = HVC.getSizeOf(DstInpTy);

    [[maybe_unused]] unsigned DstElements = HVC.length(DstInpTy);

    [[maybe_unused]] auto *DstElemTy =

        cast<PointerType>(DstInpTy->getElementType());

    assert(DstElemTy && "llvm.scatter needs vector of ptr argument");

    LLVM_DEBUG(dbgs() << "  Gather feeds into scatter\n  Values to scatter : "

                      << *Dst->getOperand(0) << "\n");

    LLVM_DEBUG(dbgs() << "  Dst type(" << *DstInpTy << ") elements("

                      << DstElements << ") VecLen(" << DstInpSize << ") type("

                      << *DstElemTy << ") Access alignment("

                      << *Dst->getOperand(2) << ")\n");

    // Address of source

    auto *Src = getPointer(IndexLoad);

    if (!Src)

      return nullptr;

    LLVM_DEBUG(dbgs() << "  Src            : " << *Src << "\n");


    if (!isa<PointerType>(Src->getType())) {

      LLVM_DEBUG(dbgs() << "    Source is not a pointer type...\n");

      return nullptr;

    }


    Value *CastedSrc = Builder.CreateBitOrPointerCast(

        Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");

    LLVM_DEBUG(dbgs() << "  CastedSrc: " << *CastedSrc << "\n");


    auto *DstLoad = locateAddressFromIntrinsic(Dst);

    if (!DstLoad) {

      LLVM_DEBUG(dbgs() << "  Unable to locate DstLoad\n");

      return nullptr;

    }

    LLVM_DEBUG(dbgs() << "  DstLoad  : " << *DstLoad << "\n");


    Value *Ptr = getPointer(DstLoad);

    if (!Ptr)

      return nullptr;

    LLVM_DEBUG(dbgs() << "  Ptr      : " << *Ptr << "\n");

    Value *CastIndex =

        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad);

    LLVM_DEBUG(dbgs() << "  Cast index: " << *CastIndex << ")\n");

    // Shift all i16 left by 1 to match short addressing mode instead of

    // byte.

    auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);

    Value *AdjustedIndex = HVC.createHvxIntrinsic(

        Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});

    LLVM_DEBUG(dbgs() << "  Shifted half index: " << *AdjustedIndex << ")\n");


    return Builder.CreateIntrinsic(

        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,

        {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),

         AdjustedIndex},

        nullptr);

  } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {

    // Gather feeds into previously inserted pseudo intrinsic.

    // These could not be in the same packet, so we need to generate another

    // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo

    // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,

    // ModRegs:$Mu, HvxVR:$Vv)

    if (isa<AllocaInst>(IndexLoad)) {

      auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);

      if (cstDataVector) {

        // Our indexes are represented as a constant. We need THEM in a reg.

        // This most likely will not work properly since alloca gives us DDR

        // stack location. This will be fixed once we teach compiler about VTCM.

        AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);

        [[maybe_unused]] auto *StoreIndexes =

            Builder.CreateStore(cstDataVector, IndexesAlloca);

        LLVM_DEBUG(dbgs() << "  StoreIndexes   : " << *StoreIndexes << "\n");

        Value *LoadedIndex = Builder.CreateLoad(

            IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");

        AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);

        LLVM_DEBUG(dbgs() << "  ResultAlloca   : " << *ResultAlloca << "\n");


        Value *CastedSrc = Builder.CreateBitOrPointerCast(

            IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");

        LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n");


        Gather = Builder.CreateIntrinsic(

            Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,

            {ResultAlloca, CastedSrc,

             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},

            nullptr);

        Value *LoadedResult = Builder.CreateLoad(

            HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");

        LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n");

        LLVM_DEBUG(dbgs() << "  Gather         : " << *Gather << "\n");

        In.replaceAllUsesWith(LoadedResult);

      }

    } else {

      // Address of source

      auto *Src = getPointer(IndexLoad);

      if (!Src)

        return nullptr;

      LLVM_DEBUG(dbgs() << "  Src      : " << *Src << "\n");


      Value *CastedSrc = Builder.CreateBitOrPointerCast(

          Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");

      LLVM_DEBUG(dbgs() << "  CastedSrc: " << *CastedSrc << "\n");


      auto *DstLoad = locateAddressFromIntrinsic(Dst);

      if (!DstLoad)

        return nullptr;

      LLVM_DEBUG(dbgs() << "  DstLoad  : " << *DstLoad << "\n");

      auto *Ptr = getPointer(DstLoad);

      if (!Ptr)

        return nullptr;

      LLVM_DEBUG(dbgs() << "  Ptr      : " << *Ptr << "\n");


      Gather = Builder.CreateIntrinsic(

          Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh,

          {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),

           Indexes},

          nullptr);

    }

    return Gather;

  } else if (Qual == HvxIdioms::HEX_Scatter) {

    // This is the case when result of a gather is used as an argument to

    // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it

    // ourselves. We have to create alloca, store to it, and replace all uses

    // with that.

    AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);

    Value *CastedSrc = Builder.CreateBitOrPointerCast(

        IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");

    LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n");

    Value *CastIndex =

        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);

    LLVM_DEBUG(dbgs() << "  Cast index     : " << *CastIndex << ")\n");


    Gather = Builder.CreateIntrinsic(

        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,

        {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),

         CastIndex},

        nullptr);

    Value *LoadedResult = Builder.CreateLoad(

        HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");

    LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n");

    In.replaceAllUsesWith(LoadedResult);

  } else if (Qual == HvxIdioms::HEX_Gather) {

    // Gather feeds to another gather but already replaced with

    // hexagon_V6_vgathermh_128B

    if (isa<AllocaInst>(IndexLoad)) {

      auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);

      if (cstDataVector) {

        // Our indexes are represented as a constant. We need it in a reg.

        AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);


        [[maybe_unused]] auto *StoreIndexes =

            Builder.CreateStore(cstDataVector, IndexesAlloca);

        LLVM_DEBUG(dbgs() << "  StoreIndexes   : " << *StoreIndexes << "\n");

        Value *LoadedIndex = Builder.CreateLoad(

            IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");

        AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);

        LLVM_DEBUG(dbgs() << "  ResultAlloca   : " << *ResultAlloca

                          << "\n  AddressSpace: "

                          << ResultAlloca->getAddressSpace() << "\n";);


        Value *CastedSrc = Builder.CreateBitOrPointerCast(

            IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");

        LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n");


        Gather = Builder.CreateIntrinsic(

            Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,

            {ResultAlloca, CastedSrc,

             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},

            nullptr);

        Value *LoadedResult = Builder.CreateLoad(

            HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");

        LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n");

        LLVM_DEBUG(dbgs() << "  Gather         : " << *Gather << "\n");

        In.replaceAllUsesWith(LoadedResult);

      }

    }

  } else if (Qual == HvxIdioms::LLVM_Gather) {

    // Gather feeds into another gather

    errs() << " Underimplemented vgather to vgather sequence\n";

    return nullptr;

  } else

    llvm_unreachable("Unhandled Qual enum");


  return Gather;

}


auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,

                                     const FxpOp &Op) const -> Value * {

  assert(Op.X.Val->getType() == Op.Y.Val->getType());

  auto *InpTy = cast<VectorType>(Op.X.Val->getType());

  unsigned Width = InpTy->getScalarSizeInBits();

  bool Rounding = Op.RoundAt.has_value();


  if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {

    // The fixed-point intrinsics do signed multiplication.

    if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {

      Value *QMul = nullptr;

      if (Width == 16) {

        QMul = createMulQ15(Builder, Op.X, Op.Y, Rounding);

      } else if (Width == 32) {

        QMul = createMulQ31(Builder, Op.X, Op.Y, Rounding);

      }

      if (QMul != nullptr)

        return QMul;

    }

  }


  assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n

  assert(Width < 32 || Width % 32 == 0);       // Width > 32 => Width is 32*k


  // If Width < 32, then it should really be 16.

  if (Width < 32) {

    if (Width < 16)

      return nullptr;

    // Getting here with Op.Frac == 0 isn't wrong, but suboptimal: here we

    // generate a full precision products, which is unnecessary if there is

    // no shift.

    assert(Width == 16);

    assert(Op.Frac != 0 && "Unshifted mul should have been skipped");

    if (Op.Frac == 16) {

      // Multiply high

      if (Value *MulH = createMulH16(Builder, Op.X, Op.Y))

        return MulH;

    }

    // Do full-precision multiply and shift.

    Value *Prod32 = createMul16(Builder, Op.X, Op.Y);

    if (Rounding) {

      Value *RoundVal = HVC.getConstSplat(Prod32->getType(), 1 << *Op.RoundAt);

      Prod32 = Builder.CreateAdd(Prod32, RoundVal, "add");

    }


    Value *ShiftAmt = HVC.getConstSplat(Prod32->getType(), Op.Frac);

    Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed

                         ? Builder.CreateAShr(Prod32, ShiftAmt, "asr")

                         : Builder.CreateLShr(Prod32, ShiftAmt, "lsr");

    return Builder.CreateTrunc(Shifted, InpTy, "trn");

  }


  // Width >= 32


  // Break up the arguments Op.X and Op.Y into vectors of smaller widths

  // in preparation of doing the multiplication by 32-bit parts.

  auto WordX = HVC.splitVectorElements(Builder, Op.X.Val, /*ToWidth=*/32);

  auto WordY = HVC.splitVectorElements(Builder, Op.Y.Val, /*ToWidth=*/32);

  auto WordP = createMulLong(Builder, WordX, Op.X.Sgn, WordY, Op.Y.Sgn);


  auto *HvxWordTy = cast<VectorType>(WordP.front()->getType());


  // Add the optional rounding to the proper word.

  if (Op.RoundAt.has_value()) {

    Value *Zero = HVC.getNullValue(WordX[0]->getType());

    SmallVector<Value *> RoundV(WordP.size(), Zero);

    RoundV[*Op.RoundAt / 32] =

        HVC.getConstSplat(HvxWordTy, 1 << (*Op.RoundAt % 32));

    WordP = createAddLong(Builder, WordP, RoundV);

  }


  // createRightShiftLong?


  // Shift all products right by Op.Frac.

  unsigned SkipWords = Op.Frac / 32;

  Constant *ShiftAmt = HVC.getConstSplat(HvxWordTy, Op.Frac % 32);


  for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {

    int Src = Dst + SkipWords;

    Value *Lo = WordP[Src];

    if (Src + 1 < End) {

      Value *Hi = WordP[Src + 1];

      WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,

                                           {Hi, Lo, ShiftAmt},

                                           /*FMFSource*/ nullptr, "int");

    } else {

      // The shift of the most significant word.

      WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt, "asr");

    }

  }

  if (SkipWords != 0)

    WordP.resize(WordP.size() - SkipWords);


  return HVC.joinVectorElements(Builder, WordP, Op.ResTy);

}


auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,

                             bool Rounding) const -> Value * {

  assert(X.Val->getType() == Y.Val->getType());

  assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));

  assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));


  // There is no non-rounding intrinsic for i16.

  if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)

    return nullptr;


  auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhvsrs);

  return HVC.createHvxIntrinsic(Builder, V6_vmpyhvsrs, X.Val->getType(),

                                {X.Val, Y.Val});

}


auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,

                             bool Rounding) const -> Value * {

  Type *InpTy = X.Val->getType();

  assert(InpTy == Y.Val->getType());

  assert(InpTy->getScalarType() == HVC.getIntTy(32));

  assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));


  if (X.Sgn == Unsigned || Y.Sgn == Unsigned)

    return nullptr;


  auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyewuh);

  auto V6_vmpyo_acc = Rounding

                          ? HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_rnd_sacc)

                          : HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_sacc);

  Value *V1 =

      HVC.createHvxIntrinsic(Builder, V6_vmpyewuh, InpTy, {X.Val, Y.Val});

  return HVC.createHvxIntrinsic(Builder, V6_vmpyo_acc, InpTy,

                                {V1, X.Val, Y.Val});

}


auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,

                               Value *CarryIn) const

    -> std::pair<Value *, Value *> {

  assert(X->getType() == Y->getType());

  auto VecTy = cast<VectorType>(X->getType());

  if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {

    SmallVector<Value *> Args = {X, Y};

    Intrinsic::ID AddCarry;

    if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {

      AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarryo);

    } else {

      AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry);

      if (CarryIn == nullptr)

        CarryIn = HVC.getNullValue(HVC.getBoolTy(HVC.length(VecTy)));

      Args.push_back(CarryIn);

    }

    Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,

                                        /*RetTy=*/nullptr, Args);

    Value *Result = Builder.CreateExtractValue(Ret, {0}, "ext");

    Value *CarryOut = Builder.CreateExtractValue(Ret, {1}, "ext");

    return {Result, CarryOut};

  }


  // In other cases, do a regular add, and unsigned compare-less-than.

  // The carry-out can originate in two places: adding the carry-in or adding

  // the two input values.

  Value *Result1 = X; // Result1 = X + CarryIn

  if (CarryIn != nullptr) {

    unsigned Width = VecTy->getScalarSizeInBits();

    uint32_t Mask = 1;

    if (Width < 32) {

      for (unsigned i = 0, e = 32 / Width; i != e; ++i)

        Mask = (Mask << Width) | 1;

    }

    auto V6_vandqrt = HVC.HST.getIntrinsicId(Hexagon::V6_vandqrt);

    Value *ValueIn =

        HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,

                               {CarryIn, HVC.getConstInt(Mask)});

    Result1 = Builder.CreateAdd(X, ValueIn, "add");

  }


  Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X, "cmp");

  Value *Result2 = Builder.CreateAdd(Result1, Y, "add");

  Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y, "cmp");

  return {Result2, Builder.CreateOr(CarryOut1, CarryOut2, "orb")};

}


auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const

    -> Value * {

  Intrinsic::ID V6_vmpyh = 0;

  std::tie(X, Y) = canonSgn(X, Y);


  if (X.Sgn == Signed) {

    V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhv);

  } else if (Y.Sgn == Signed) {

    // In vmpyhus the second operand is unsigned

    V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhus);

  } else {

    V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhv);

  }


  // i16*i16 -> i32 / interleaved

  Value *P =

      HVC.createHvxIntrinsic(Builder, V6_vmpyh, HvxP32Ty, {Y.Val, X.Val});

  // Deinterleave

  return HVC.vshuff(Builder, HVC.sublo(Builder, P), HVC.subhi(Builder, P));

}


auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const

    -> Value * {

  Type *HvxI16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/false);


  if (HVC.HST.useHVXV69Ops()) {

    if (X.Sgn != Signed && Y.Sgn != Signed) {

      auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhvs);

      return HVC.createHvxIntrinsic(Builder, V6_vmpyuhvs, HvxI16Ty,

                                    {X.Val, Y.Val});

    }

  }


  Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);

  Value *Pair16 =

      Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty, "cst");

  unsigned Len = HVC.length(HvxP16Ty) / 2;


  SmallVector<int, 128> PickOdd(Len);

  for (int i = 0; i != static_cast<int>(Len); ++i)

    PickOdd[i] = 2 * i + 1;


  return Builder.CreateShuffleVector(

      HVC.sublo(Builder, Pair16), HVC.subhi(Builder, Pair16), PickOdd, "shf");

}


auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const

    -> std::pair<Value *, Value *> {

  assert(X.Val->getType() == Y.Val->getType());

  assert(X.Val->getType() == HvxI32Ty);


  Intrinsic::ID V6_vmpy_parts;

  std::tie(X, Y) = canonSgn(X, Y);


  if (X.Sgn == Signed) {

    V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;

  } else if (Y.Sgn == Signed) {

    V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;

  } else {

    V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;

  }


  Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,

                                        {X.Val, Y.Val}, {HvxI32Ty});

  Value *Hi = Builder.CreateExtractValue(Parts, {0}, "ext");

  Value *Lo = Builder.CreateExtractValue(Parts, {1}, "ext");

  return {Lo, Hi};

}


auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,

                              ArrayRef<Value *> WordY) const

    -> SmallVector<Value *> {

  assert(WordX.size() == WordY.size());

  unsigned Idx = 0, Length = WordX.size();

  SmallVector<Value *> Sum(Length);


  while (Idx != Length) {

    if (HVC.isZero(WordX[Idx]))

      Sum[Idx] = WordY[Idx];

    else if (HVC.isZero(WordY[Idx]))

      Sum[Idx] = WordX[Idx];

    else

      break;

    ++Idx;

  }


  Value *Carry = nullptr;

  for (; Idx != Length; ++Idx) {

    std::tie(Sum[Idx], Carry) =

        createAddCarry(Builder, WordX[Idx], WordY[Idx], Carry);

  }


  // This drops the final carry beyond the highest word.

  return Sum;

}


auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,

                              Signedness SgnX, ArrayRef<Value *> WordY,

                              Signedness SgnY) const -> SmallVector<Value *> {

  SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());


  // WordX[i] * WordY[j] produces words i+j and i+j+1 of the results,

  // that is halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.

  for (int i = 0, e = WordX.size(); i != e; ++i) {

    for (int j = 0, f = WordY.size(); j != f; ++j) {

      // Check the 4 halves that this multiplication can generate.

      Signedness SX = (i + 1 == e) ? SgnX : Unsigned;

      Signedness SY = (j + 1 == f) ? SgnY : Unsigned;

      auto [Lo, Hi] = createMul32(Builder, {WordX[i], SX}, {WordY[j], SY});

      Products[i + j + 0].push_back(Lo);

      Products[i + j + 1].push_back(Hi);

    }

  }


  Value *Zero = HVC.getNullValue(WordX[0]->getType());


  auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {

    if (Vector.empty())

      return Zero;

    auto Last = Vector.back();

    Vector.pop_back();

    return Last;

  };


  for (int i = 0, e = Products.size(); i != e; ++i) {

    while (Products[i].size() > 1) {

      Value *Carry = nullptr; // no carry-in

      for (int j = i; j != e; ++j) {

        auto &ProdJ = Products[j];

        auto [Sum, CarryOut] = createAddCarry(Builder, pop_back_or_zero(ProdJ),

                                              pop_back_or_zero(ProdJ), Carry);

        ProdJ.insert(ProdJ.begin(), Sum);

        Carry = CarryOut;

      }

    }

  }


  SmallVector<Value *> WordP;

  for (auto &P : Products) {

    assert(P.size() == 1 && "Should have been added together");

    WordP.push_back(P.front());

  }


  return WordP;

}


auto HvxIdioms::run() -> bool {

  bool Changed = false;


  for (BasicBlock &B : HVC.F) {

    for (auto It = B.rbegin(); It != B.rend(); ++It) {

      if (auto Fxm = matchFxpMul(*It)) {

        Value *New = processFxpMul(*It, *Fxm);

        // Always report "changed" for now.

        Changed = true;

        if (!New)

          continue;

        bool StartOver = !isa<Instruction>(New);

        It->replaceAllUsesWith(New);

        RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);

        It = StartOver ? B.rbegin()

                       : cast<Instruction>(New)->getReverseIterator();

        Changed = true;

      } else if (matchGather(*It)) {

        Value *New = processVGather(*It);

        if (!New)

          continue;

        LLVM_DEBUG(dbgs() << "  Gather : " << *New << "\n");

        // We replace original intrinsic with a new pseudo call.

        It->eraseFromParent();

        It = cast<Instruction>(New)->getReverseIterator();

        RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);

        Changed = true;

      } else if (matchScatter(*It)) {

        Value *New = processVScatter(*It);

        if (!New)

          continue;

        LLVM_DEBUG(dbgs() << "  Scatter : " << *New << "\n");

        // We replace original intrinsic with a new pseudo call.

        It->eraseFromParent();

        It = cast<Instruction>(New)->getReverseIterator();

        RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);

        Changed = true;

      }

    }

  }


  return Changed;

}


// --- End HvxIdioms


auto HexagonVectorCombine::run() -> bool {

  if (DumpModule)

    dbgs() << "Module before HexagonVectorCombine\n" << *F.getParent();


  bool Changed = false;

  if (HST.useHVXOps()) {

    if (VAEnabled)

      Changed |= AlignVectors(*this).run();

    if (VIEnabled)

      Changed |= HvxIdioms(*this).run();

  }


  if (DumpModule) {

    dbgs() << "Module " << (Changed ? "(modified)" : "(unchanged)")

           << " after HexagonVectorCombine\n"

           << *F.getParent();

  }

  return Changed;

}


auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {

  return IntegerType::get(F.getContext(), Width);

}


auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {

  assert(ElemCount >= 0);

  IntegerType *ByteTy = Type::getInt8Ty(F.getContext());

  if (ElemCount == 0)

    return ByteTy;

  return VectorType::get(ByteTy, ElemCount, /*Scalable=*/false);

}


auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {

  assert(ElemCount >= 0);

  IntegerType *BoolTy = Type::getInt1Ty(F.getContext());

  if (ElemCount == 0)

    return BoolTy;

  return VectorType::get(BoolTy, ElemCount, /*Scalable=*/false);

}


auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const

    -> ConstantInt * {

  return ConstantInt::getSigned(getIntTy(Width), Val);

}


auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {

  if (auto *C = dyn_cast<Constant>(Val))

    return C->isZeroValue();

  return false;

}


auto HexagonVectorCombine::getIntValue(const Value *Val) const

    -> std::optional<APInt> {

  if (auto *CI = dyn_cast<ConstantInt>(Val))

    return CI->getValue();

  return std::nullopt;

}


auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {

  return isa<UndefValue>(Val);

}


auto HexagonVectorCombine::isTrue(const Value *Val) const -> bool {

  return Val == ConstantInt::getTrue(Val->getType());

}


auto HexagonVectorCombine::isFalse(const Value *Val) const -> bool {

  return isZero(Val);

}


auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const

    -> VectorType * {

  EVT ETy = EVT::getEVT(ElemTy, false);

  assert(ETy.isSimple() && "Invalid HVX element type");

  // Do not allow boolean types here: they don't have a fixed length.

  assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&

         "Invalid HVX element type");

  unsigned HwLen = HST.getVectorLength();

  unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();

  return VectorType::get(ElemTy, Pair ? 2 * NumElems : NumElems,

                         /*Scalable=*/false);

}


auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const

    -> int {

  return getSizeOf(Val->getType(), Kind);

}


auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const

    -> int {

  auto *NcTy = const_cast<Type *>(Ty);

  switch (Kind) {

  case Store:

    return DL.getTypeStoreSize(NcTy).getFixedValue();

  case Alloc:

    return DL.getTypeAllocSize(NcTy).getFixedValue();

  }

  llvm_unreachable("Unhandled SizeKind enum");

}


auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {

  // The actual type may be shorter than the HVX vector, so determine

  // the alignment based on subtarget info.

  if (HST.isTypeForHVX(Ty))

    return HST.getVectorLength();

  return DL.getABITypeAlign(Ty).value();

}


auto HexagonVectorCombine::length(Value *Val) const -> size_t {

  return length(Val->getType());

}


auto HexagonVectorCombine::length(Type *Ty) const -> size_t {

  auto *VecTy = dyn_cast<VectorType>(Ty);

  assert(VecTy && "Must be a vector type");

  return VecTy->getElementCount().getFixedValue();

}


auto HexagonVectorCombine::getNullValue(Type *Ty) const -> Constant * {

  assert(Ty->isIntOrIntVectorTy());

  auto Zero = ConstantInt::get(Ty->getScalarType(), 0);

  if (auto *VecTy = dyn_cast<VectorType>(Ty))

    return ConstantVector::getSplat(VecTy->getElementCount(), Zero);

  return Zero;

}


auto HexagonVectorCombine::getFullValue(Type *Ty) const -> Constant * {

  assert(Ty->isIntOrIntVectorTy());

  auto Minus1 = ConstantInt::get(Ty->getScalarType(), -1);

  if (auto *VecTy = dyn_cast<VectorType>(Ty))

    return ConstantVector::getSplat(VecTy->getElementCount(), Minus1);

  return Minus1;

}


auto HexagonVectorCombine::getConstSplat(Type *Ty, int Val) const

    -> Constant * {

  assert(Ty->isVectorTy());

  auto VecTy = cast<VectorType>(Ty);

  Type *ElemTy = VecTy->getElementType();

  // Add support for floats if needed.

  auto *Splat = ConstantVector::getSplat(VecTy->getElementCount(),

                                         ConstantInt::get(ElemTy, Val));

  return Splat;

}


auto HexagonVectorCombine::simplify(Value *V) const -> Value * {

  if (auto *In = dyn_cast<Instruction>(V)) {

    SimplifyQuery Q(DL, &TLI, &DT, &AC, In);

    return simplifyInstruction(In, Q);

  }

  return nullptr;

}


// Insert bytes [Start..Start+Length) of Src into Dst at byte Where.

auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,

                                   Value *Src, int Start, int Length,

                                   int Where) const -> Value * {

  assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));

  int SrcLen = getSizeOf(Src);

  int DstLen = getSizeOf(Dst);

  assert(0 <= Start && Start + Length <= SrcLen);

  assert(0 <= Where && Where + Length <= DstLen);


  int P2Len = PowerOf2Ceil(SrcLen | DstLen);

  auto *Poison = PoisonValue::get(getByteTy());

  Value *P2Src = vresize(Builder, Src, P2Len, Poison);

  Value *P2Dst = vresize(Builder, Dst, P2Len, Poison);


  SmallVector<int, 256> SMask(P2Len);

  for (int i = 0; i != P2Len; ++i) {

    // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].

    // Otherwise, pick Dst[i];

    SMask[i] =

        (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;

  }


  Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask, "shf");

  return vresize(Builder, P2Insert, DstLen, Poison);

}


auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,

                                    Value *Hi, Value *Amt) const -> Value * {

  assert(Lo->getType() == Hi->getType() && "Argument type mismatch");

  if (isZero(Amt))

    return Hi;

  int VecLen = getSizeOf(Hi);

  if (auto IntAmt = getIntValue(Amt))

    return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),

                           VecLen);


  if (HST.isTypeForHVX(Hi->getType())) {

    assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&

           "Expecting an exact HVX type");

    return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_vlalignb),

                              Hi->getType(), {Hi, Lo, Amt});

  }


  if (VecLen == 4) {

    Value *Pair = concat(Builder, {Lo, Hi});

    Value *Shift =

        Builder.CreateLShr(Builder.CreateShl(Pair, Amt, "shl"), 32, "lsr");

    Value *Trunc =

        Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");

    return Builder.CreateBitCast(Trunc, Hi->getType(), "cst");

  }

  if (VecLen == 8) {

    Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt, "sub");

    return vralignb(Builder, Lo, Hi, Sub);

  }

  llvm_unreachable("Unexpected vector length");

}


auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,

                                    Value *Hi, Value *Amt) const -> Value * {

  assert(Lo->getType() == Hi->getType() && "Argument type mismatch");

  if (isZero(Amt))

    return Lo;

  int VecLen = getSizeOf(Lo);

  if (auto IntAmt = getIntValue(Amt))

    return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);


  if (HST.isTypeForHVX(Lo->getType())) {

    assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&

           "Expecting an exact HVX type");

    return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_valignb),

                              Lo->getType(), {Hi, Lo, Amt});

  }


  if (VecLen == 4) {

    Value *Pair = concat(Builder, {Lo, Hi});

    Value *Shift = Builder.CreateLShr(Pair, Amt, "lsr");

    Value *Trunc =

        Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");

    return Builder.CreateBitCast(Trunc, Lo->getType(), "cst");

  }

  if (VecLen == 8) {

    Type *Int64Ty = Type::getInt64Ty(F.getContext());

    Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst");

    Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst");

    Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb,

                                          {Hi64, Lo64, Amt},

                                          /*FMFSource=*/nullptr, "cup");

    return Builder.CreateBitCast(Call, Lo->getType(), "cst");

  }

  llvm_unreachable("Unexpected vector length");

}


// Concatenates a sequence of vectors of the same type.

auto HexagonVectorCombine::concat(IRBuilderBase &Builder,

                                  ArrayRef<Value *> Vecs) const -> Value * {

  assert(!Vecs.empty());

  SmallVector<int, 256> SMask;

  std::vector<Value *> Work[2];

  int ThisW = 0, OtherW = 1;


  Work[ThisW].assign(Vecs.begin(), Vecs.end());

  while (Work[ThisW].size() > 1) {

    auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());

    SMask.resize(length(Ty) * 2);

    std::iota(SMask.begin(), SMask.end(), 0);


    Work[OtherW].clear();

    if (Work[ThisW].size() % 2 != 0)

      Work[ThisW].push_back(UndefValue::get(Ty));

    for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {

      Value *Joined = Builder.CreateShuffleVector(

          Work[ThisW][i], Work[ThisW][i + 1], SMask, "shf");

      Work[OtherW].push_back(Joined);

    }

    std::swap(ThisW, OtherW);

  }


  // Since there may have been some undefs appended to make shuffle operands

  // have the same type, perform the last shuffle to only pick the original

  // elements.

  SMask.resize(Vecs.size() * length(Vecs.front()->getType()));

  std::iota(SMask.begin(), SMask.end(), 0);

  Value *Total = Work[ThisW].front();

  return Builder.CreateShuffleVector(Total, SMask, "shf");

}


auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,

                                   int NewSize, Value *Pad) const -> Value * {

  assert(isa<VectorType>(Val->getType()));

  auto *ValTy = cast<VectorType>(Val->getType());

  assert(ValTy->getElementType() == Pad->getType());


  int CurSize = length(ValTy);

  if (CurSize == NewSize)

    return Val;

  // Truncate?

  if (CurSize > NewSize)

    return getElementRange(Builder, Val, /*Ignored*/ Val, 0, NewSize);

  // Extend.

  SmallVector<int, 128> SMask(NewSize);

  std::iota(SMask.begin(), SMask.begin() + CurSize, 0);

  std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);

  Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad, "spt");

  return Builder.CreateShuffleVector(Val, PadVec, SMask, "shf");

}


auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,

                                   Type *FromTy, Type *ToTy) const -> Value * {

  // Mask is a vector <N x i1>, where each element corresponds to an

  // element of FromTy. Remap it so that each element will correspond

  // to an element of ToTy.

  assert(isa<VectorType>(Mask->getType()));


  Type *FromSTy = FromTy->getScalarType();

  Type *ToSTy = ToTy->getScalarType();

  if (FromSTy == ToSTy)

    return Mask;


  int FromSize = getSizeOf(FromSTy);

  int ToSize = getSizeOf(ToSTy);

  assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);


  auto *MaskTy = cast<VectorType>(Mask->getType());

  int FromCount = length(MaskTy);

  int ToCount = (FromCount * FromSize) / ToSize;

  assert((FromCount * FromSize) % ToSize == 0);


  auto *FromITy = getIntTy(FromSize * 8);

  auto *ToITy = getIntTy(ToSize * 8);


  // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->

  // -> trunc to <M x i1>.

  Value *Ext = Builder.CreateSExt(

      Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false), "sxt");

  Value *Cast = Builder.CreateBitCast(

      Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false), "cst");

  return Builder.CreateTrunc(

      Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false), "trn");

}


// Bitcast to bytes, and return least significant bits.

auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const

    -> Value * {

  Type *ScalarTy = Val->getType()->getScalarType();

  if (ScalarTy == getBoolTy())

    return Val;


  Value *Bytes = vbytes(Builder, Val);

  if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))

    return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)), "trn");

  // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not

  // <1 x i1>.

  return Builder.CreateTrunc(Bytes, getBoolTy(), "trn");

}


// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.

auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const

    -> Value * {

  Type *ScalarTy = Val->getType()->getScalarType();

  if (ScalarTy == getByteTy())

    return Val;


  if (ScalarTy != getBoolTy())

    return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)), "cst");

  // For bool, return a sext from i1 to i8.

  if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))

    return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy), "sxt");

  return Builder.CreateSExt(Val, getByteTy(), "sxt");

}


auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,

                                     unsigned Start, unsigned Length) const

    -> Value * {

  assert(Start + Length <= length(Val));

  return getElementRange(Builder, Val, /*Ignored*/ Val, Start, Length);

}


auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const

    -> Value * {

  size_t Len = length(Val);

  assert(Len % 2 == 0 && "Length should be even");

  return subvector(Builder, Val, 0, Len / 2);

}


auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const

    -> Value * {

  size_t Len = length(Val);

  assert(Len % 2 == 0 && "Length should be even");

  return subvector(Builder, Val, Len / 2, Len / 2);

}


auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,

                                 Value *Val1) const -> Value * {

  assert(Val0->getType() == Val1->getType());

  int Len = length(Val0);

  SmallVector<int, 128> Mask(2 * Len);


  for (int i = 0; i != Len; ++i) {

    Mask[i] = 2 * i;           // Even

    Mask[i + Len] = 2 * i + 1; // Odd

  }

  return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");

}


auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,

                                  Value *Val1) const -> Value * { //

  assert(Val0->getType() == Val1->getType());

  int Len = length(Val0);

  SmallVector<int, 128> Mask(2 * Len);


  for (int i = 0; i != Len; ++i) {

    Mask[2 * i + 0] = i;       // Val0

    Mask[2 * i + 1] = i + Len; // Val1

  }

  return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");

}


auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,

                                              Intrinsic::ID IntID, Type *RetTy,

                                              ArrayRef<Value *> Args,

                                              ArrayRef<Type *> ArgTys,

                                              ArrayRef<Value *> MDSources) const

    -> Value * {

  auto getCast = [&](IRBuilderBase &Builder, Value *Val,

                     Type *DestTy) -> Value * {

    Type *SrcTy = Val->getType();

    if (SrcTy == DestTy)

      return Val;


    // Non-HVX type. It should be a scalar, and it should already have

    // a valid type.

    assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));


    Type *BoolTy = Type::getInt1Ty(F.getContext());

    if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)

      return Builder.CreateBitCast(Val, DestTy, "cst");


    // Predicate HVX vector.

    unsigned HwLen = HST.getVectorLength();

    Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast

                                   : Intrinsic::hexagon_V6_pred_typecast_128B;

    return Builder.CreateIntrinsic(TC, {DestTy, Val->getType()}, {Val},

                                   /*FMFSource=*/nullptr, "cup");

  };


  Function *IntrFn =

      Intrinsic::getOrInsertDeclaration(F.getParent(), IntID, ArgTys);

  FunctionType *IntrTy = IntrFn->getFunctionType();


  SmallVector<Value *, 4> IntrArgs;

  for (int i = 0, e = Args.size(); i != e; ++i) {

    Value *A = Args[i];

    Type *T = IntrTy->getParamType(i);

    if (A->getType() != T) {

      IntrArgs.push_back(getCast(Builder, A, T));

    } else {

      IntrArgs.push_back(A);

    }

  }

  StringRef MaybeName = !IntrTy->getReturnType()->isVoidTy() ? "cup" : "";

  CallInst *Call = Builder.CreateCall(IntrFn, IntrArgs, MaybeName);


  MemoryEffects ME = Call->getAttributes().getMemoryEffects();

  if (!ME.doesNotAccessMemory() && !ME.onlyAccessesInaccessibleMem())

    propagateMetadata(Call, MDSources);


  Type *CallTy = Call->getType();

  if (RetTy == nullptr || CallTy == RetTy)

    return Call;

  // Scalar types should have RetTy matching the call return type.

  assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));

  return getCast(Builder, Call, RetTy);

}


auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,

                                               Value *Vec,

                                               unsigned ToWidth) const

    -> SmallVector<Value *> {

  // Break a vector of wide elements into a series of vectors with narrow

  // elements:

  //   (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)

  // -->

  //   (a0, a1, a2, ...)    // lowest "ToWidth" bits

  //   (b0, b1, b2, ...)    // the next lowest...

  //   (c0, c1, c2, ...)    // ...

  //   ...

  //

  // The number of elements in each resulting vector is the same as

  // in the original vector.


  auto *VecTy = cast<VectorType>(Vec->getType());

  assert(VecTy->getElementType()->isIntegerTy());

  unsigned FromWidth = VecTy->getScalarSizeInBits();

  assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));

  assert(ToWidth <= FromWidth && "Breaking up into wider elements?");

  unsigned NumResults = FromWidth / ToWidth;


  SmallVector<Value *> Results(NumResults);

  Results[0] = Vec;

  unsigned Length = length(VecTy);


  // Do it by splitting in half, since those operations correspond to deal

  // instructions.

  auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {

    // Take V = Results[Begin], split it in L, H.

    // Store Results[Begin] = L, Results[(Begin+End)/2] = H

    // Call itself recursively split(Begin, Half), split(Half+1, End)

    if (Begin + 1 == End)

      return;


    Value *Val = Results[Begin];

    unsigned Width = Val->getType()->getScalarSizeInBits();


    auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);

    Value *VVal = Builder.CreateBitCast(Val, VTy, "cst");


    Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));


    unsigned Half = (Begin + End) / 2;

    Results[Begin] = sublo(Builder, Res);

    Results[Half] = subhi(Builder, Res);


    splitFunc(Begin, Half, splitFunc);

    splitFunc(Half, End, splitFunc);

  };


  splitInHalf(0, NumResults, splitInHalf);

  return Results;

}


auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,

                                              ArrayRef<Value *> Values,

                                              VectorType *ToType) const

    -> Value * {

  assert(ToType->getElementType()->isIntegerTy());


  // If the list of values does not have power-of-2 elements, append copies

  // of the sign bit to it, to make the size be 2^n.

  // The reason for this is that the values will be joined in pairs, because

  // otherwise the shuffles will result in convoluted code. With pairwise

  // joins, the shuffles will hopefully be folded into a perfect shuffle.

  // The output will need to be sign-extended to a type with element width

  // being a power-of-2 anyways.

  SmallVector<Value *> Inputs(Values);


  unsigned ToWidth = ToType->getScalarSizeInBits();

  unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();

  assert(Width <= ToWidth);

  assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));

  unsigned Length = length(Inputs.front()->getType());


  unsigned NeedInputs = ToWidth / Width;

  if (Inputs.size() != NeedInputs) {

    // Having too many inputs is ok: drop the high bits (usual wrap-around).

    // If there are too few, fill them with the sign bit.

    Value *Last = Inputs.back();

    Value *Sign = Builder.CreateAShr(

        Last, getConstSplat(Last->getType(), Width - 1), "asr");

    Inputs.resize(NeedInputs, Sign);

  }


  while (Inputs.size() > 1) {

    Width *= 2;

    auto *VTy = VectorType::get(getIntTy(Width), Length, false);

    for (int i = 0, e = Inputs.size(); i < e; i += 2) {

      Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);

      Inputs[i / 2] = Builder.CreateBitCast(Res, VTy, "cst");

    }

    Inputs.resize(Inputs.size() / 2);

  }


  assert(Inputs.front()->getType() == ToType);

  return Inputs.front();

}


auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,

                                                      Value *Ptr1) const

    -> std::optional<int> {

  // Try SCEV first.

  const SCEV *Scev0 = SE.getSCEV(Ptr0);

  const SCEV *Scev1 = SE.getSCEV(Ptr1);

  const SCEV *ScevDiff = SE.getMinusSCEV(Scev0, Scev1);

  if (auto *Const = dyn_cast<SCEVConstant>(ScevDiff)) {

    APInt V = Const->getAPInt();

    if (V.isSignedIntN(8 * sizeof(int)))

      return static_cast<int>(V.getSExtValue());

  }


  struct Builder : IRBuilder<> {

    Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}

    ~Builder() {

      for (Instruction *I : llvm::reverse(ToErase))

        I->eraseFromParent();

    }

    SmallVector<Instruction *, 8> ToErase;

  };


#define CallBuilder(B, F)                                                      \

  [&](auto &B_) {                                                              \

    Value *V = B_.F;                                                           \

    if (auto *I = dyn_cast<Instruction>(V))                                    \

      B_.ToErase.push_back(I);                                                 \

    return V;                                                                  \

  }(B)


  auto Simplify = [this](Value *V) {

    if (Value *S = simplify(V))

      return S;

    return V;

  };


  auto StripBitCast = [](Value *V) {

    while (auto *C = dyn_cast<BitCastInst>(V))

      V = C->getOperand(0);

    return V;

  };


  Ptr0 = StripBitCast(Ptr0);

  Ptr1 = StripBitCast(Ptr1);

  if (!isa<GetElementPtrInst>(Ptr0) || !isa<GetElementPtrInst>(Ptr1))

    return std::nullopt;


  auto *Gep0 = cast<GetElementPtrInst>(Ptr0);

  auto *Gep1 = cast<GetElementPtrInst>(Ptr1);

  if (Gep0->getPointerOperand() != Gep1->getPointerOperand())

    return std::nullopt;

  if (Gep0->getSourceElementType() != Gep1->getSourceElementType())

    return std::nullopt;


  Builder B(Gep0->getParent());

  int Scale = getSizeOf(Gep0->getSourceElementType(), Alloc);


  // FIXME: for now only check GEPs with a single index.

  if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)

    return std::nullopt;


  Value *Idx0 = Gep0->getOperand(1);

  Value *Idx1 = Gep1->getOperand(1);


  // First, try to simplify the subtraction directly.

  if (auto *Diff = dyn_cast<ConstantInt>(

          Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))

    return Diff->getSExtValue() * Scale;


  KnownBits Known0 = getKnownBits(Idx0, Gep0);

  KnownBits Known1 = getKnownBits(Idx1, Gep1);

  APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);

  if (Unknown.isAllOnes())

    return std::nullopt;


  Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);

  Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));

  Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));

  Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));

  int Diff0 = 0;

  if (auto *C = dyn_cast<ConstantInt>(SubU)) {

    Diff0 = C->getSExtValue();

  } else {

    return std::nullopt;

  }


  Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);

  Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));

  Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));

  Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));

  int Diff1 = 0;

  if (auto *C = dyn_cast<ConstantInt>(SubK)) {

    Diff1 = C->getSExtValue();

  } else {

    return std::nullopt;

  }


  return (Diff0 + Diff1) * Scale;


#undef CallBuilder

}


auto HexagonVectorCombine::getNumSignificantBits(const Value *V,

                                                 const Instruction *CtxI) const

    -> unsigned {

  return ComputeMaxSignificantBits(V, DL, &AC, CtxI, &DT);

}


auto HexagonVectorCombine::getKnownBits(const Value *V,

                                        const Instruction *CtxI) const

    -> KnownBits {

  return computeKnownBits(V, DL, &AC, CtxI, &DT);

}


auto HexagonVectorCombine::isSafeToClone(const Instruction &In) const -> bool {

  if (In.mayHaveSideEffects() || In.isAtomic() || In.isVolatile() ||

      In.isFenceLike() || In.mayReadOrWriteMemory()) {

    return false;

  }

  if (isa<CallBase>(In) || isa<AllocaInst>(In))

    return false;

  return true;

}


template <typename T>

auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,

                                                  BasicBlock::const_iterator To,

                                                  const T &IgnoreInsts) const

    -> bool {

  auto getLocOrNone =

      [this](const Instruction &I) -> std::optional<MemoryLocation> {

    if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {

      switch (II->getIntrinsicID()) {

      case Intrinsic::masked_load:

        return MemoryLocation::getForArgument(II, 0, TLI);

      case Intrinsic::masked_store:

        return MemoryLocation::getForArgument(II, 1, TLI);

      }

    }

    return MemoryLocation::getOrNone(&I);

  };


  // The source and the destination must be in the same basic block.

  const BasicBlock &Block = *In.getParent();

  assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);

  // No PHIs.

  if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))

    return false;


  if (!mayHaveNonDefUseDependency(In))

    return true;

  bool MayWrite = In.mayWriteToMemory();

  auto MaybeLoc = getLocOrNone(In);


  auto From = In.getIterator();

  if (From == To)

    return true;

  bool MoveUp = (To != Block.end() && To->comesBefore(&In));

  auto Range =

      MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);

  for (auto It = Range.first; It != Range.second; ++It) {

    const Instruction &I = *It;

    if (llvm::is_contained(IgnoreInsts, &I))

      continue;

    // assume intrinsic can be ignored

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {

      if (II->getIntrinsicID() == Intrinsic::assume)

        continue;

    }

    // Parts based on isSafeToMoveBefore from CoveMoverUtils.cpp.

    if (I.mayThrow())

      return false;

    if (auto *CB = dyn_cast<CallBase>(&I)) {

      if (!CB->hasFnAttr(Attribute::WillReturn))

        return false;

      if (!CB->hasFnAttr(Attribute::NoSync))

        return false;

    }

    if (I.mayReadOrWriteMemory()) {

      auto MaybeLocI = getLocOrNone(I);

      if (MayWrite || I.mayWriteToMemory()) {

        if (!MaybeLoc || !MaybeLocI)

          return false;

        if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))

          return false;

      }

    }

  }

  return true;

}


auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {

  if (auto *VecTy = dyn_cast<VectorType>(Ty))

    return VecTy->getElementType() == getByteTy();

  return false;

}


auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,

                                           Value *Hi, int Start,

                                           int Length) const -> Value * {

  assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));

  SmallVector<int, 128> SMask(Length);

  std::iota(SMask.begin(), SMask.end(), Start);

  return Builder.CreateShuffleVector(Lo, Hi, SMask, "shf");

}


// Pass management.


namespace {

class HexagonVectorCombineLegacy : public FunctionPass {

public:

  static char ID;


  HexagonVectorCombineLegacy() : FunctionPass(ID) {}


  StringRef getPassName() const override { return "Hexagon Vector Combine"; }


  void getAnalysisUsage(AnalysisUsage &AU) const override {

    AU.setPreservesCFG();

    AU.addRequired<AAResultsWrapperPass>();

    AU.addRequired<AssumptionCacheTracker>();

    AU.addRequired<DominatorTreeWrapperPass>();

    AU.addRequired<ScalarEvolutionWrapperPass>();

    AU.addRequired<TargetLibraryInfoWrapperPass>();

    AU.addRequired<TargetPassConfig>();

    FunctionPass::getAnalysisUsage(AU);

  }


  bool runOnFunction(Function &F) override {

    if (skipFunction(F))

      return false;

    AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();

    AssumptionCache &AC =

        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();

    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();

    TargetLibraryInfo &TLI =

        getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);

    auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();

    HexagonVectorCombine HVC(F, AA, AC, DT, SE, TLI, TM);

    return HVC.run();

  }

};

} // namespace


char HexagonVectorCombineLegacy::ID = 0;


INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,

                      "Hexagon Vector Combine", false, false)

INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)

INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)

INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)

INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)

INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)

INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)

INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,

                    "Hexagon Vector Combine", false, false)


FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {

  return new HexagonVectorCombineLegacy();

}


Poison
@ Poison
Definition AArch64AsmPrinter.cpp:76

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

const
aarch64 promote const
Definition AArch64PromoteConstant.cpp:228

Alloc
AMDGPU Prepare AGPR Alloc
Definition AMDGPUPrepareAGPRAlloc.cpp:68

APInt.h
This file implements a class to represent arbitrary precision integral constant values and operations...

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

Results
Function Alias Analysis Results
Definition AliasAnalysis.cpp:747

AliasAnalysis.h

ArrayRef.h

AssumptionCache.h

getIntTy
static IntegerType * getIntTy(IRBuilderBase &B, const TargetLibraryInfo *TLI)
Definition BuildLibCalls.cpp:1603

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

D
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

Info
Analysis containing CSE Info
Definition CSEInfo.cpp:27

CommandLine.h

DenseMap.h
This file defines the DenseMap class.

Dominators.h

runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition EntryExitInstrumenter.cpp:103

DEBUG_TYPE
#define DEBUG_TYPE
Definition GenericCycleImpl.h:31

simplify
hexagon bit simplify
Definition HexagonBitSimplify.cpp:267

SizeLimit
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))

Vector
shuff Hexagon Optimize Shuffle Vector
Definition HexagonOptShuffleVector.cpp:126

HexagonSubtarget.h

HexagonTargetMachine.h

locateIndexesFromIntrinsic
static Value * locateIndexesFromIntrinsic(Instruction *In)
Definition HexagonVectorCombine.cpp:2051

locateDestination
Instruction * locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual)
Definition HexagonVectorCombine.cpp:1922

getReinterpretiveCast_i8_to_i32
Value * getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC, IRBuilderBase &Builder, LLVMContext &Ctx, Value *I)
Definition HexagonVectorCombine.cpp:2083

locateIndexesFromGEP
static Value * locateIndexesFromGEP(Value *In)
Definition HexagonVectorCombine.cpp:2023

CallBuilder
#define CallBuilder(B, F)

getPointer
Value * getPointer(Value *Ptr)
Definition HexagonVectorCombine.cpp:1856

DEFAULT_HVX_VTCM_PAGE_SIZE
#define DEFAULT_HVX_VTCM_PAGE_SIZE
Definition HexagonVectorCombine.cpp:63

locateAddressFromIntrinsic
static Value * locateAddressFromIntrinsic(Instruction *In)
Definition HexagonVectorCombine.cpp:1961

selectDestination
static Instruction * selectDestination(Instruction *In, HvxIdioms::DstQualifier &Qual)
Definition HexagonVectorCombine.cpp:1869

get_i32_Mask
Value * get_i32_Mask(const HexagonVectorCombine &HVC, IRBuilderBase &Builder, LLVMContext &Ctx, unsigned int pattern)
Definition HexagonVectorCombine.cpp:2098

isArithmetic
bool isArithmetic(unsigned Opc)
Definition HexagonVectorCombine.cpp:1838

getIndexType
static Type * getIndexType(Value *In)
Definition HexagonVectorCombine.cpp:2007

locateGepFromIntrinsic
GetElementPtrInst * locateGepFromIntrinsic(Instruction *In)
Definition HexagonVectorCombine.cpp:1944

getReinterpretiveCast_i16_to_i32
Value * getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC, IRBuilderBase &Builder, LLVMContext &Ctx, Value *I)
Definition HexagonVectorCombine.cpp:2068

Hexagon.h

IRBuilder.h

IntrinsicInst.h

Users
iv Induction Variable Users
Definition IVUsers.cpp:48

InitializePasses.h

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

InstSimplifyFolder.h

InstructionSimplify.h

getMask
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
Definition InterleavedAccessPass.cpp:588

Intrinsics.h

TemplateParamKind::Type
@ Type
Definition ItaniumDemangle.h:1243

KnownBits.h

isZero
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539

F
#define F(x, y, z)
Definition MD5.cpp:54

I
#define I(x, y, z)
Definition MD5.cpp:57

G
#define G(x, y, z)
Definition MD5.cpp:55

H
#define H(x, y, z)
Definition MD5.cpp:56

isCandidate
static bool isCandidate(const MachineInstr *MI, Register &DefedReg, Register FrameReg)
Definition MachineLateInstrsCleanup.cpp:179

isUndef
static bool isUndef(const MachineInstr &MI)
Definition MachineSSAContext.cpp:57

MathExtras.h

Metadata.h
This file contains the declarations for metadata subclasses.

T
#define T
Definition Mips16ISelLowering.cpp:282

Unknown
@ Unknown
Definition NVPTXISelLowering.cpp:5933

Unsigned
@ Unsigned
Definition NVPTXISelLowering.cpp:5932

Signed
@ Signed
Definition NVPTXISelLowering.cpp:5931

Range
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

P
#define P(N)

if
if(PassOpts->AAPipeline)
Definition PassBuilderBindings.cpp:64

INITIALIZE_PASS_DEPENDENCY
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42

INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44

INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39

Pass.h

PatternMatch.h

Opc
auto Opc
Definition RISCVRedundantCopyElimination.cpp:75

Uses
Remove Loads Into Fake Uses
Definition RemoveLoadsIntoFakeUses.cpp:81

getConstInt
static ConstantInt * getConstInt(MDNode *MD, unsigned NumOp)
Definition SPIRVCallLowering.cpp:92

STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.

ScalarEvolutionExpressions.h

SmallVector.h
This file defines the SmallVector class.

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:114

Y
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")

X
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")

getType
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39

TargetLibraryInfo.h

TargetPassConfig.h
Target-Independent Code Generator Pass Configuration Options pass.

Local.h

ValueTracking.h

ValueTypes.h

VectorUtils.h

Predicate
Definition AMDGPURegBankLegalizeRules.cpp:375

VectorType
Definition ItaniumDemangle.h:1189

llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition AliasAnalysis.h:1022

llvm::APInt::isAllOnes
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372

llvm::APInt::ashr
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828

llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition Instructions.h:122

llvm::AllocaInst::getAddressSpace
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition Instructions.h:107

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition PassAnalysisSupport.h:76

llvm::AnalysisUsage::setPreservesCFG
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40

llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition AssumptionCache.h:211

llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition AssumptionCache.h:44

llvm::BasicBlock::const_iterator
InstListType::const_iterator const_iterator
Definition BasicBlock.h:171

llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170

llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233

llvm::CallBase::getAttributes
AttributeList getAttributes() const
Return the attributes for this call.
Definition InstrTypes.h:1422

llvm::CmpInst::ICMP_ULT
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701

llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition InstrTypes.h:698

llvm::ConstantDataVector::get
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition Constants.cpp:3036

llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition Constants.h:87

llvm::ConstantInt::getTrue
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition Constants.cpp:888

llvm::ConstantInt::getSigned
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:136

llvm::ConstantVector::getSplat
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition Constants.cpp:1495

llvm::Constant
This is an important base class in LLVM.
Definition Constant.h:43

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63

llvm::DataLayout::getTypeAllocSize
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.cpp:858

llvm::DenseMap
Definition DenseMap.h:747

llvm::DomTreeNodeBase::children
iterator_range< iterator > children()
Definition GenericDomTree.h:83

llvm::DomTreeNodeBase::getBlock
NodeT * getBlock() const
Definition GenericDomTree.h:88

llvm::DominatorTreeBase::getRootNode
DomTreeNodeBase< NodeT > * getRootNode()
getRootNode - This returns the entry node for the CFG of the function.
Definition GenericDomTree.h:420

llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:321

llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314

llvm::Function
Definition Function.h:64

llvm::Function::empty
bool empty() const
Definition Function.h:857

llvm::Function::getFunctionType
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209

llvm::Function::back
const BasicBlock & back() const
Definition Function.h:860

llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition Instructions.h:950

llvm::GetElementPtrInst::getPointerOperand
Value * getPointerOperand()
Definition Instructions.h:1054

llvm::HexagonSubtarget
Definition HexagonSubtarget.h:43

llvm::HexagonSubtarget::isHVXVectorType
bool isHVXVectorType(EVT VecTy, bool IncludeBool=false) const
Definition HexagonSubtarget.cpp:179

llvm::HexagonSubtarget::useHVXV62Ops
bool useHVXV62Ops() const
Definition HexagonSubtarget.h:261

llvm::HexagonSubtarget::useHVXV69Ops
bool useHVXV69Ops() const
Definition HexagonSubtarget.h:276

llvm::HexagonSubtarget::getVectorLength
unsigned getVectorLength() const
Definition HexagonSubtarget.h:335

llvm::HexagonSubtarget::useHVXV66Ops
bool useHVXV66Ops() const
Definition HexagonSubtarget.h:267

llvm::HexagonSubtarget::isTypeForHVX
bool isTypeForHVX(Type *VecTy, bool IncludeBool=false) const
Definition HexagonSubtarget.cpp:207

llvm::HexagonSubtarget::getIntrinsicId
Intrinsic::ID getIntrinsicId(unsigned Opc) const
Definition HexagonSubtarget.cpp:725

llvm::IRBuilderBase
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114

llvm::IRBuilderBase::CreateAlloca
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition IRBuilder.h:1833

llvm::IRBuilderBase::CreateVectorSplat
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition IRBuilder.cpp:1125

llvm::IRBuilderBase::CreateExtractValue
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2626

llvm::IRBuilderBase::CreateSelect
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.cpp:1015

llvm::IRBuilderBase::CreateSExt
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2097

llvm::IRBuilderBase::CreateLShr
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1513

llvm::IRBuilderBase::CreateICmpNE
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2336

llvm::IRBuilderBase::CreateIntrinsic
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition IRBuilder.cpp:845

llvm::IRBuilderBase::CreateBitOrPointerCast
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289

llvm::IRBuilderBase::CreateCmp
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2466

llvm::IRBuilderBase::CreateSub
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1420

llvm::IRBuilderBase::CreateBitCast
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207

llvm::IRBuilderBase::CreateLoad
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1850

llvm::IRBuilderBase::CreateShl
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1492

llvm::IRBuilderBase::CreateZExt
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2085

llvm::IRBuilderBase::CreateShuffleVector
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601

llvm::IRBuilderBase::CreateAnd
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1551

llvm::IRBuilderBase::CreateStore
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863

llvm::IRBuilderBase::CreateAdd
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1403

llvm::IRBuilderBase::CreatePtrToInt
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2197

llvm::IRBuilderBase::CreateCall
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2511

llvm::IRBuilderBase::CreateTrunc
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2071

llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207

llvm::IRBuilderBase::CreateAShr
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1532

llvm::IRBuilderBase::CreateICmp
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442

llvm::IRBuilderBase::CreateOr
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1573

llvm::Instruction
Definition Instruction.h:69

llvm::Instruction::getOpcodeName
const char * getOpcodeName() const
Definition Instruction.h:314

llvm::IntegerType
Class to represent integer types.
Definition DerivedTypes.h:42

llvm::IntegerType::get
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition IntrinsicInst.h:49

llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition IntrinsicInst.h:56

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::LoadInst
An instruction for reading from memory.
Definition Instructions.h:181

llvm::MemoryEffectsBase::doesNotAccessMemory
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220

llvm::MemoryEffectsBase::onlyAccessesInaccessibleMem
bool onlyAccessesInaccessibleMem() const
Whether this function only (at most) accesses inaccessible memory.
Definition ModRef.h:239

llvm::MemoryLocation::getOrNone
static LLVM_ABI std::optional< MemoryLocation > getOrNone(const Instruction *Inst)
Definition MemoryLocation.cpp:78

llvm::MemoryLocation::getForArgument
static LLVM_ABI MemoryLocation getForArgument(const CallBase *Call, unsigned ArgIdx, const TargetLibraryInfo *TLI)
Return a location representing a particular argument of a call.
Definition MemoryLocation.cpp:181

llvm::PoisonValue::get
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition Constants.cpp:1906

llvm::ScalarEvolutionWrapperPass
Definition ScalarEvolution.h:2411

llvm::ScalarEvolution
The main scalar evolution driver.
Definition ScalarEvolution.h:457

llvm::const_iterator

llvm::SmallVectorImpl::resize
void resize(size_type N)
Definition SmallVector.h:639

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:417

llvm::SmallVectorTemplateCommon::end
iterator end()
Definition SmallVector.h:273

llvm::SmallVectorTemplateCommon::begin
iterator begin()
Definition SmallVector.h:271

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1203

llvm::StoreInst
An instruction for storing to memory.
Definition Instructions.h:297

llvm::TargetLibraryInfoWrapperPass
Definition TargetLibraryInfo.h:635

llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition TargetLibraryInfo.h:266

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition TargetMachine.h:83

llvm::TargetMachine::getSubtargetImpl
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
Definition TargetMachine.h:139

llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition TargetPassConfig.h:84

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45

llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273

llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246

llvm::Type::getPointerAddressSpace
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition DerivedTypes.h:773

llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352

llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128

llvm::Type::getScalarSizeInBits
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230

llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240

llvm::UndefValue::get
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition Constants.cpp:1887

llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition User.h:232

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256

llvm::Value::replaceAllUsesWith
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546

llvm::Value::getName
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322

llvm::cl::opt
Definition CommandLine.h:1455

llvm::ilist_detail::node_parent_access::getParent
const ParentTy * getParent() const
Definition ilist_node.h:34

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition ilist_node.h:123

llvm::iterator

llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53

Call
CallInst * Call
Definition ObjCARCOpts.cpp:2359

Changed
Changed
Definition ObjCARCOpts.cpp:2369

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

TargetMachine.h

false
Definition MachinePipeliner.cpp:244

llvm::AA
Abstract Attribute helper functions.
Definition Attributor.h:165

llvm::AArch64::Rounding
Rounding
Possible values of current rounding mode, which is specified in bits 23:22 of FPCR.
Definition AArch64ISelLowering.h:31

llvm::AMDGPU::CPol::NT
@ NT
Definition SIDefines.h:374

llvm::AMDGPU::HSAMD::Kernel::Arg::Key::Align
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
Definition AMDGPUMetadata.h:183

llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition AMDGPUMetadata.h:396

llvm::ARMBuildAttrs::Section
@ Section
Legacy Tags.
Definition ARMBuildAttributes.h:83

llvm::ARM_MB::SY
@ SY
Definition ARMBaseInfo.h:74

llvm::ARM::ProfileKind::M
@ M
Definition ARMTargetParser.h:171

llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:126

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::ISD::BasicBlock
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81

llvm::ISD::Constant
@ Constant
Definition ISDOpcodes.h:86

llvm::Intrinsic::getOrInsertDeclaration
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition Intrinsics.cpp:755

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::M68k::MemAddrModeKind::j
@ j
Definition M68kBaseInfo.h:52

llvm::M68k::MemAddrModeKind::U
@ U
Definition M68kBaseInfo.h:61

llvm::M68k::MemAddrModeKind::V
@ V
Definition M68kBaseInfo.h:63

llvm::M68k::MemAddrModeKind::p
@ p
Definition M68kBaseInfo.h:55

llvm::M68k::MemAddrModeKind::f
@ f
Definition M68kBaseInfo.h:56

llvm::M68k::MemAddrModeKind::L
@ L
Definition M68kBaseInfo.h:70

llvm::NVPTX::Const
@ Const
Definition NVPTX.h:184

llvm::PPC::Predicate
Predicate
Predicate - These are "(BI << 5) | BO" for various predicates.
Definition PPCPredicates.h:26

llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition PatternMatch.h:1208

llvm::PatternMatch::m_AShr
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
Definition PatternMatch.h:1352

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition PatternMatch.h:49

llvm::PatternMatch::m_Shr
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
Definition PatternMatch.h:1678

llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition PatternMatch.h:181

llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition PatternMatch.h:1274

llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition PatternMatch.h:105

llvm::PatternMatch::m_LShr
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
Definition PatternMatch.h:1346

llvm::PatternMatch::m_CombineOr
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition PatternMatch.h:272

llvm::RISCVFenceField::W
@ W
Definition RISCVBaseInfo.h:469

llvm::RISCVFenceField::R
@ R
Definition RISCVBaseInfo.h:468

llvm::RegState::Undef
@ Undef
Value of the register doesn't matter.
Definition MachineInstrBuilder.h:55

llvm::SPII::Store
@ Store
Definition SparcInstrInfo.h:33

llvm::SPII::Load
@ Load
Definition SparcInstrInfo.h:32

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:139

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:445

llvm::codeview::CompileSym3Flags::Exp
@ Exp
Definition CodeView.h:459

llvm::codeview::PublicSymFlags::Function
@ Function
Definition CodeView.h:408

llvm::dwarf::Index
Index
Definition Dwarf.h:903

llvm::logicalview::LVPrintKind::Elements
@ Elements
Definition LVOptions.h:153

llvm::logicalview::LVAttributeKind::Zero
@ Zero
Definition LVOptions.h:130

llvm::lsp::TraceLevel::Off
@ Off
Definition Protocol.h:200

llvm::memprof::Meta::Start
@ Start
Definition MemProf.h:69

llvm::ms_demangle::IntrinsicFunctionKind::New
@ New
Definition MicrosoftDemangleNodes.h:121

llvm::ms_demangle::QualifierMangleMode::Result
@ Result
Definition MicrosoftDemangle.h:132

llvm::msgpack::Type::Map
@ Map
Definition MsgPackReader.h:63

llvm::numbers::e
constexpr double e
Definition STLForwardCompat.h:61

llvm::objcarc::ARCInstKind::User
@ User
could "use" a pointer
Definition ObjCARCInstKind.h:52

llvm::omp::RTLDependInfoFields::Len
@ Len
Definition OMPConstants.h:286

llvm::pdb::PDB_SymType::VectorType
@ VectorType
Definition PDBTypes.h:278

llvm::pdb::DbgHeaderType::Max
@ Max
Definition RawConstants.h:98

llvm::sampleprof::Base
@ Base
Definition Discriminator.h:58

llvm::sandboxir::Instruction
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73

llvm::sandboxir::getTerminator
LLVM_ABI Instruction * getTerminator() const

llvm::sandboxir::front
LLVM_ABI Instruction & front() const

llvm::tgtok::Bits
@ Bits
Definition TGLexer.h:78

llvm::tgtok::In
@ In
Definition TGLexer.h:83

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::createHexagonVectorCombineLegacyPass
FunctionPass * createHexagonVectorCombineLegacyPass()
Definition HexagonVectorCombine.cpp:3859

llvm::Offset
@ Offset
Definition DWP.cpp:532

llvm::Length
@ Length
Definition DWP.cpp:532

llvm::Value
FunctionAddr VTableAddr Value
Definition InstrProf.h:137

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725

llvm::PseudoProbeType::Block
@ Block
Definition PseudoProbe.h:30

llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655

llvm::RecursivelyDeleteTriviallyDeadInstructions
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::Int32Ty
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296

llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition Instructions.h:5092

llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136

llvm::isPowerOf2_64
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284

llvm::MemoryEffects
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:301

llvm::propagateMetadata
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
Definition VectorUtils.cpp:1080

llvm::copy_if
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1777

llvm::Log2_64
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337

llvm::concat
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150

llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385

llvm::simplifyInstruction
LLVM_ABI Value * simplifyInstruction(Instruction *I, const SimplifyQuery &Q)
See if we can compute a simplified version of this instruction.
Definition InstructionSimplify.cpp:7576

llvm::DomTreeNode
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94

llvm::reverse
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406

llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279

llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622

llvm::ComplexDeinterleavingOperation::Splat
@ Splat
Definition ComplexDeinterleavingPass.h:42

llvm::computeKnownBits
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition ValueTracking.cpp:152

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207

llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Definition SmallVector.h:1129

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547

llvm::errs
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition raw_ostream.cpp:897

llvm::PackElem::Hi
@ Hi
Definition VECustomDAG.h:132

llvm::PackElem::Lo
@ Lo
Definition VECustomDAG.h:131

llvm::IRBuilder
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >

llvm::RecurKind::And
@ And
Bitwise or logical AND of integers.
Definition IVDescriptors.h:42

llvm::RecurKind::Sub
@ Sub
Subtraction of integers.
Definition IVDescriptors.h:38

llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:22

llvm::max_element
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2030

llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition APFixedPoint.h:312

llvm::ArrayRef
ArrayRef(const T &OneElt) -> ArrayRef< T >

llvm::PseudoProbeReservedId::Last
@ Last
Definition PseudoProbe.h:28

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::erase_if
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2120

llvm::ComputeMaxSignificantBits
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
Definition ValueTracking.cpp:344

llvm::getLoadStoreType
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Definition Instructions.h:5147

llvm::AliasAnalysis
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Definition AliasAnalysis.h:721

llvm::mayHaveNonDefUseDependency
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
Definition ValueTracking.cpp:7241

llvm::TensorType::Total
@ Total
Definition TensorSpec.h:59

shuffles::vshuff
MaskT vshuff(ArrayRef< int > Vu, ArrayRef< int > Vv, unsigned Size, bool TakeOdd)
Definition HexagonISelDAGToDAGHVX.cpp:866

shuffles::vdeal
MaskT vdeal(ArrayRef< int > Vu, ArrayRef< int > Vv, unsigned Size, bool TakeOdd)
Definition HexagonISelDAGToDAGHVX.cpp:879

std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869

raw_ostream.h

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::Align::value
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77

llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137

llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373

llvm::EVT::getEVT
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition ValueTypes.cpp:301

llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316

llvm::KnownBits
Definition KnownBits.h:24

llvm::KnownBits::One
APInt One
Definition KnownBits.h:26

llvm::KnownBits::Zero
APInt Zero
Definition KnownBits.h:25