1 //===-- HexagonVectorCombine.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // HexagonVectorCombine is a utility class implementing a variety of functions
9 // that assist in vector-based optimizations.
10 //
11 // AlignVectors: replace unaligned vector loads and stores with aligned ones.
12 //===----------------------------------------------------------------------===//
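// Editorial sketch (not part of the pass): the idea behind AlignVectors shown
// on plain bytes. An unaligned N-byte access can be emulated by two N-byte
// accesses from addresses aligned down to N, plus a byte-wise shift selecting
// the misaligned window; on Hexagon that shift maps to a single valign. The
// helper below is illustrative only and assumes N is a power of two and that
// Buf extends at least one sector past the requested window.
static void unalignedLoadViaAlignedSketch(const unsigned char *Buf, int Pos,
                                          int N, unsigned char *Out) {
  int Start = Pos & -N;                      // align the position down to N
  int Shift = Pos - Start;                   // bytes into the aligned sector
  const unsigned char *Lo = Buf + Start;     // first aligned sector
  const unsigned char *Hi = Buf + Start + N; // next aligned sector
  for (int i = 0; i != N; ++i)               // "valign": bytes of concat(Hi,Lo) >> Shift
    Out[i] = (i + Shift < N) ? Lo[i + Shift] : Hi[i + Shift - N];
}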
13 
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/IntrinsicInst.h"
32 #include "llvm/IR/Intrinsics.h"
33 #include "llvm/IR/IntrinsicsHexagon.h"
34 #include "llvm/IR/Metadata.h"
35 #include "llvm/IR/PatternMatch.h"
36 #include "llvm/InitializePasses.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/KnownBits.h"
43 
44 #include "HexagonSubtarget.h"
45 #include "HexagonTargetMachine.h"
46 
47 #include <algorithm>
48 #include <deque>
49 #include <map>
50 #include <optional>
51 #include <set>
52 #include <utility>
53 #include <vector>
54 
55 #define DEBUG_TYPE "hexagon-vc"
56 
57 using namespace llvm;
58 
59 namespace {
60 class HexagonVectorCombine {
61 public:
62  HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
63  DominatorTree &DT_, TargetLibraryInfo &TLI_,
64  const TargetMachine &TM_)
65  : F(F_), DL(F.getParent()->getDataLayout()), AA(AA_), AC(AC_), DT(DT_),
66  TLI(TLI_),
67  HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}
68 
69  bool run();
70 
71  // Common integer type.
72  IntegerType *getIntTy(unsigned Width = 32) const;
73  // Byte type: either scalar (when ElemCount = 0), or vector with given
74  // element count.
75  Type *getByteTy(int ElemCount = 0) const;
76  // Boolean type: either scalar (when ElemCount = 0), or vector with given
77  // element count.
78  Type *getBoolTy(int ElemCount = 0) const;
79  // Create a ConstantInt of type returned by getIntTy with the value Val.
80  ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
81  // Get the integer value of V, if it exists.
82  std::optional<APInt> getIntValue(const Value *Val) const;
83  // Is V a constant 0, or a vector of 0s?
84  bool isZero(const Value *Val) const;
85  // Is V an undef value?
86  bool isUndef(const Value *Val) const;
87 
88  // Get HVX vector type with the given element type.
89  VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;
90 
91  enum SizeKind {
92  Store, // Store size
93  Alloc, // Alloc size
94  };
95  int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
96  int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
97  int getTypeAlignment(Type *Ty) const;
98  size_t length(Value *Val) const;
99  size_t length(Type *Ty) const;
100 
101  Constant *getNullValue(Type *Ty) const;
102  Constant *getFullValue(Type *Ty) const;
103  Constant *getConstSplat(Type *Ty, int Val) const;
104 
105  Value *simplify(Value *Val) const;
106 
107  Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
108  int Length, int Where) const;
109  Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
110  Value *Amt) const;
111  Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
112  Value *Amt) const;
113  Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
114  Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
115  Value *Pad) const;
116  Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
117  Type *ToTy) const;
118  Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
119  Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
120  Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
121  unsigned Length) const;
122  Value *sublo(IRBuilderBase &Builder, Value *Val) const;
123  Value *subhi(IRBuilderBase &Builder, Value *Val) const;
124  Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
125  Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
126 
127  Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
128  Type *RetTy, ArrayRef<Value *> Args,
129  ArrayRef<Type *> ArgTys = None) const;
130  SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
131  unsigned ToWidth) const;
132  Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
133  VectorType *ToType) const;
134 
135  std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
136 
137  unsigned getNumSignificantBits(const Value *V,
138  const Instruction *CtxI = nullptr) const;
139  KnownBits getKnownBits(const Value *V,
140  const Instruction *CtxI = nullptr) const;
141 
142  template <typename T = std::vector<Instruction *>>
143  bool isSafeToMoveBeforeInBB(const Instruction &In,
144  BasicBlock::const_iterator To,
145  const T &IgnoreInsts = {}) const;
146 
147  // This function is only used for assertions at the moment.
148  [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
149 
150  Function &F;
151  const DataLayout &DL;
152  AliasAnalysis &AA;
153  AssumptionCache &AC;
154  DominatorTree &DT;
155  TargetLibraryInfo &TLI;
156  const HexagonSubtarget &HST;
157 
158 private:
159  Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
160  int Start, int Length) const;
161 };
162 
163 class AlignVectors {
164 public:
165  AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}
166 
167  bool run();
168 
169 private:
170  using InstList = std::vector<Instruction *>;
171 
172  struct Segment {
173  void *Data;
174  int Start;
175  int Size;
176  };
177 
178  struct AddrInfo {
179  AddrInfo(const AddrInfo &) = default;
180  AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
181  Align H)
182  : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
183  NeedAlign(HVC.getTypeAlignment(ValTy)) {}
184  AddrInfo &operator=(const AddrInfo &) = default;
185 
186  // XXX: add Size member?
187  Instruction *Inst;
188  Value *Addr;
189  Type *ValTy;
190  Align HaveAlign;
191  Align NeedAlign;
192  int Offset = 0; // Offset (in bytes) from the first member of the
193  // containing AddrList.
194  };
195  using AddrList = std::vector<AddrInfo>;
196 
197  struct InstrLess {
198  bool operator()(const Instruction *A, const Instruction *B) const {
199  return A->comesBefore(B);
200  }
201  };
202  using DepList = std::set<Instruction *, InstrLess>;
203 
204  struct MoveGroup {
205  MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
206  : Base(B), Main{AI.Inst}, IsHvx(Hvx), IsLoad(Load) {}
207  Instruction *Base; // Base instruction of the parent address group.
208  InstList Main; // Main group of instructions.
209  InstList Deps; // List of dependencies.
210  bool IsHvx; // Is this a group of HVX instructions?
211  bool IsLoad; // Is this a load group?
212  };
213  using MoveList = std::vector<MoveGroup>;
214 
215  struct ByteSpan {
216  struct Segment {
217  // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
218  Segment(Value *Val, int Begin, int Len)
219  : Val(Val), Start(Begin), Size(Len) {}
220  Segment(const Segment &Seg) = default;
221  Segment &operator=(const Segment &Seg) = default;
222  Value *Val; // Value representable as a sequence of bytes.
223  int Start; // First byte of the value that belongs to the segment.
224  int Size; // Number of bytes in the segment.
225  };
226 
227  struct Block {
228  Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
229  Block(Value *Val, int Off, int Len, int Pos)
230  : Seg(Val, Off, Len), Pos(Pos) {}
231  Block(const Block &Blk) = default;
232  Block &operator=(const Block &Blk) = default;
233  Segment Seg; // Value segment.
234  int Pos; // Position (offset) of the segment in the Block.
235  };
236 
237  int extent() const;
238  ByteSpan section(int Start, int Length) const;
239  ByteSpan &shift(int Offset);
240  SmallVector<Value *, 8> values() const;
241 
242  int size() const { return Blocks.size(); }
243  Block &operator[](int i) { return Blocks[i]; }
244 
245  std::vector<Block> Blocks;
246 
247  using iterator = decltype(Blocks)::iterator;
248  iterator begin() { return Blocks.begin(); }
249  iterator end() { return Blocks.end(); }
250  using const_iterator = decltype(Blocks)::const_iterator;
251  const_iterator begin() const { return Blocks.begin(); }
252  const_iterator end() const { return Blocks.end(); }
253  };
254 
255  Align getAlignFromValue(const Value *V) const;
256  std::optional<MemoryLocation> getLocation(const Instruction &In) const;
257  std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
258  bool isHvx(const AddrInfo &AI) const;
259  // This function is only used for assertions at the moment.
260  [[maybe_unused]] bool isSectorTy(Type *Ty) const;
261 
262  Value *getPayload(Value *Val) const;
263  Value *getMask(Value *Val) const;
264  Value *getPassThrough(Value *Val) const;
265 
266  Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
267  int Adjust) const;
268  Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
269  int Alignment) const;
270  Value *createAlignedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
271  int Alignment, Value *Mask, Value *PassThru) const;
272  Value *createAlignedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
273  int Alignment, Value *Mask) const;
274 
275  DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
276  bool createAddressGroups();
277  MoveList createLoadGroups(const AddrList &Group) const;
278  MoveList createStoreGroups(const AddrList &Group) const;
279  bool move(const MoveGroup &Move) const;
280  void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
281  int ScLen, Value *AlignVal, Value *AlignAddr) const;
282  void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
283  int ScLen, Value *AlignVal, Value *AlignAddr) const;
284  bool realignGroup(const MoveGroup &Move) const;
285 
286  friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
287  friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
288  friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
289  friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
290 
291  std::map<Instruction *, AddrList> AddrGroups;
292  const HexagonVectorCombine &HVC;
293 };
294 
296 raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) {
297  OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
298  OS << "Addr: " << *AI.Addr << '\n';
299  OS << "Type: " << *AI.ValTy << '\n';
300  OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
301  OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
302  OS << "Offset: " << AI.Offset;
303  return OS;
304 }
305 
307 raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
308  OS << "Main\n";
309  for (Instruction *I : MG.Main)
310  OS << " " << *I << '\n';
311  OS << "Deps\n";
312  for (Instruction *I : MG.Deps)
313  OS << " " << *I << '\n';
314  return OS;
315 }
316 
318 raw_ostream &operator<<(raw_ostream &OS,
319  const AlignVectors::ByteSpan::Block &B) {
320  OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "
321  << *B.Seg.Val;
322  return OS;
323 }
324 
326 raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
327  OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
328  for (const AlignVectors::ByteSpan::Block &B : BS)
329  OS << B << '\n';
330  OS << ']';
331  return OS;
332 }
333 
334 class HvxIdioms {
335 public:
336  HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
337  auto *Int32Ty = HVC.getIntTy(32);
338  HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
339  HvxP32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/true);
340  }
341 
342  bool run();
343 
344 private:
345  enum Signedness { Positive, Signed, Unsigned };
346 
347  // Value + sign
348  // This is to keep track of whether the value should be treated as signed
349  // or unsigned, or is known to be positive.
350  struct SValue {
351  Value *Val;
352  Signedness Sgn;
353  };
354 
355  struct FxpOp {
356  unsigned Opcode;
357  unsigned Frac; // Number of fraction bits
358  SValue X, Y;
359  // If present, add 1 << RoundAt before shift:
360  std::optional<unsigned> RoundAt;
361  };
362 
363  auto getNumSignificantBits(Value *V, Instruction *In) const
364  -> std::pair<unsigned, Signedness>;
365  auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;
366 
367  auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
368  auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;
369 
370  auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
371  const FxpOp &Op) const -> Value *;
372  auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
373  bool Rounding) const -> Value *;
374  auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
375  bool Rounding) const -> Value *;
376  // Return {Result, Carry}, where Carry is a vector predicate.
377  auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
378  Value *CarryIn = nullptr) const
379  -> std::pair<Value *, Value *>;
380  auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
381  auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
382  -> Value *;
383  auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
384  -> std::pair<Value *, Value *>;
385  auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
386  ArrayRef<Value *> WordY) const -> SmallVector<Value *>;
387  auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
388  Signedness SgnX, ArrayRef<Value *> WordY,
389  Signedness SgnY) const -> SmallVector<Value *>;
390 
391  VectorType *HvxI32Ty;
392  VectorType *HvxP32Ty;
393  const HexagonVectorCombine &HVC;
394 
395  friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
396 };
397 
398 [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
399  const HvxIdioms::FxpOp &Op) {
400  static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
401  OS << Instruction::getOpcodeName(Op.Opcode) << '.' << Op.Frac;
402  if (Op.RoundAt.has_value()) {
403  if (Op.Frac != 0 && Op.RoundAt.value() == Op.Frac - 1) {
404  OS << ":rnd";
405  } else {
406  OS << " + 1<<" << Op.RoundAt.value();
407  }
408  }
409  OS << "\n X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
410  << " Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
411  return OS;
412 }
413 
414 } // namespace
415 
416 namespace {
417 
418 template <typename T> T *getIfUnordered(T *MaybeT) {
419  return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;
420 }
421 template <typename T> T *isCandidate(Instruction *In) {
422  return dyn_cast<T>(In);
423 }
424 template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {
425  return getIfUnordered(dyn_cast<LoadInst>(In));
426 }
427 template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {
428  return getIfUnordered(dyn_cast<StoreInst>(In));
429 }
430 
431 #if !defined(_MSC_VER) || _MSC_VER >= 1926
432 // VS2017 and some versions of VS2019 have trouble compiling this:
433 // error C2976: 'std::map': too few template arguments
434 // VS 2019 16.x is known to work, except for 16.4/16.5 (MSC_VER 1924/1925)
435 template <typename Pred, typename... Ts>
436 void erase_if(std::map<Ts...> &map, Pred p)
437 #else
438 template <typename Pred, typename T, typename U>
439 void erase_if(std::map<T, U> &map, Pred p)
440 #endif
441 {
442  for (auto i = map.begin(), e = map.end(); i != e;) {
443  if (p(*i))
444  i = map.erase(i);
445  else
446  i = std::next(i);
447  }
448 }
449 
450 // Forward other erase_ifs to the LLVM implementations.
451 template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
452  llvm::erase_if(std::forward<T>(container), p);
453 }
454 
455 } // namespace
456 
457 // --- Begin AlignVectors
458 
459 auto AlignVectors::ByteSpan::extent() const -> int {
460  if (size() == 0)
461  return 0;
462  int Min = Blocks[0].Pos;
463  int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
464  for (int i = 1, e = size(); i != e; ++i) {
465  Min = std::min(Min, Blocks[i].Pos);
466  Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);
467  }
468  return Max - Min;
469 }
470 
471 auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
472  ByteSpan Section;
473  for (const ByteSpan::Block &B : Blocks) {
474  int L = std::max(B.Pos, Start); // Left end.
475  int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.
476  if (L < R) {
477  // How much to chop off the beginning of the segment:
478  int Off = L > B.Pos ? L - B.Pos : 0;
479  Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);
480  }
481  }
482  return Section;
483 }
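// Editorial sketch of the clipping done by ByteSpan::section above: a block at
// position Pos with Size bytes contributes its overlap with the window
// [Start, Start+Length), chopping Off = L - Pos bytes off the front of its
// segment. Stand-alone illustrative arithmetic only.
static bool clipBlockSketch(int Pos, int Size, int Start, int Length,
                            int *Off, int *NewSize, int *NewPos) {
  int L = Pos > Start ? Pos : Start;                                  // left end
  int R = Pos + Size < Start + Length ? Pos + Size : Start + Length;  // right end + 1
  if (L >= R)
    return false; // no overlap: this block is not part of the section
  // e.g. a 64-byte block at Pos 0 clipped to section(32, 64) gives
  // Off = 32, NewSize = 32, NewPos = 32.
  *Off = L - Pos;
  *NewSize = R - L;
  *NewPos = L;
  return true;
}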
484 
485 auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
486  for (Block &B : Blocks)
487  B.Pos += Offset;
488  return *this;
489 }
490 
491 auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
492  SmallVector<Value *, 8> Values(Blocks.size());
493  for (int i = 0, e = Blocks.size(); i != e; ++i)
494  Values[i] = Blocks[i].Seg.Val;
495  return Values;
496 }
497 
498 auto AlignVectors::getAlignFromValue(const Value *V) const -> Align {
499  const auto *C = dyn_cast<ConstantInt>(V);
500  assert(C && "Alignment must be a compile-time constant integer");
501  return C->getAlignValue();
502 }
503 
504 auto AlignVectors::getAddrInfo(Instruction &In) const
505  -> std::optional<AddrInfo> {
506  if (auto *L = isCandidate<LoadInst>(&In))
507  return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
508  L->getAlign());
509  if (auto *S = isCandidate<StoreInst>(&In))
510  return AddrInfo(HVC, S, S->getPointerOperand(),
511  S->getValueOperand()->getType(), S->getAlign());
512  if (auto *II = isCandidate<IntrinsicInst>(&In)) {
513  Intrinsic::ID ID = II->getIntrinsicID();
514  switch (ID) {
515  case Intrinsic::masked_load:
516  return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),
517  getAlignFromValue(II->getArgOperand(1)));
518  case Intrinsic::masked_store:
519  return AddrInfo(HVC, II, II->getArgOperand(1),
520  II->getArgOperand(0)->getType(),
521  getAlignFromValue(II->getArgOperand(2)));
522  }
523  }
524  return std::nullopt;
525 }
526 
527 auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
528  return HVC.HST.isTypeForHVX(AI.ValTy);
529 }
530 
531 auto AlignVectors::getPayload(Value *Val) const -> Value * {
532  if (auto *In = dyn_cast<Instruction>(Val)) {
533  Intrinsic::ID ID = 0;
534  if (auto *II = dyn_cast<IntrinsicInst>(In))
535  ID = II->getIntrinsicID();
536  if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)
537  return In->getOperand(0);
538  }
539  return Val;
540 }
541 
542 auto AlignVectors::getMask(Value *Val) const -> Value * {
543  if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
544  switch (II->getIntrinsicID()) {
545  case Intrinsic::masked_load:
546  return II->getArgOperand(2);
547  case Intrinsic::masked_store:
548  return II->getArgOperand(3);
549  }
550  }
551 
552  Type *ValTy = getPayload(Val)->getType();
553  if (auto *VecTy = dyn_cast<VectorType>(ValTy))
554  return HVC.getFullValue(HVC.getBoolTy(HVC.length(VecTy)));
555  return HVC.getFullValue(HVC.getBoolTy());
556 }
557 
558 auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
559  if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
560  if (II->getIntrinsicID() == Intrinsic::masked_load)
561  return II->getArgOperand(3);
562  }
563  return UndefValue::get(getPayload(Val)->getType());
564 }
565 
566 auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
567  Type *ValTy, int Adjust) const
568  -> Value * {
569  // The adjustment is in bytes, but if it's a multiple of the type size,
570  // we don't need to do pointer casts.
571  auto *PtrTy = cast<PointerType>(Ptr->getType());
572  if (!PtrTy->isOpaque()) {
573  Type *ElemTy = PtrTy->getNonOpaquePointerElementType();
574  int ElemSize = HVC.getSizeOf(ElemTy, HVC.Alloc);
575  if (Adjust % ElemSize == 0 && Adjust != 0) {
576  Value *Tmp0 =
577  Builder.CreateGEP(ElemTy, Ptr, HVC.getConstInt(Adjust / ElemSize));
578  return Builder.CreatePointerCast(Tmp0, ValTy->getPointerTo());
579  }
580  }
581 
582  PointerType *CharPtrTy = Type::getInt8PtrTy(HVC.F.getContext());
583  Value *Tmp0 = Builder.CreatePointerCast(Ptr, CharPtrTy);
584  Value *Tmp1 = Builder.CreateGEP(Type::getInt8Ty(HVC.F.getContext()), Tmp0,
585  HVC.getConstInt(Adjust));
586  return Builder.CreatePointerCast(Tmp1, ValTy->getPointerTo());
587 }
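// Editorial sketch of the shortcut taken above: when the byte adjustment is a
// multiple of the element size, indexing by elements (a typed GEP) reaches the
// same address as adding raw bytes through an i8 pointer. Plain C++
// illustration; 'int' stands in for the pointee element type.
static int *adjustByBytesSketch(int *P, int AdjustBytes) {
  int ElemSize = static_cast<int>(sizeof(int));
  if (AdjustBytes % ElemSize == 0)
    return P + AdjustBytes / ElemSize;                       // "element" GEP
  unsigned char *Raw = reinterpret_cast<unsigned char *>(P); // i8 GEP path
  return reinterpret_cast<int *>(Raw + AdjustBytes);
}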
588 
589 auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
590  Type *ValTy, int Alignment) const
591  -> Value * {
592  Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy());
593  Value *Mask = HVC.getConstInt(-Alignment);
594  Value *And = Builder.CreateAnd(AsInt, Mask);
595  return Builder.CreateIntToPtr(And, ValTy->getPointerTo());
596 }
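// Editorial sketch of the down-alignment above: for a power-of-two Alignment,
// ANDing the address with -Alignment clears the low bits and yields the
// nearest aligned address not greater than the original one.
static unsigned long long alignDownSketch(unsigned long long Addr,
                                          unsigned long long Alignment) {
  return Addr & -Alignment; // e.g. alignDownSketch(0x1007, 0x80) == 0x1000
}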
597 
598 auto AlignVectors::createAlignedLoad(IRBuilderBase &Builder, Type *ValTy,
599  Value *Ptr, int Alignment, Value *Mask,
600  Value *PassThru) const -> Value * {
601  assert(!HVC.isUndef(Mask)); // Should this be allowed?
602  if (HVC.isZero(Mask))
603  return PassThru;
604  if (Mask == ConstantInt::getTrue(Mask->getType()))
605  return Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment));
606  return Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment), Mask, PassThru);
607 }
608 
609 auto AlignVectors::createAlignedStore(IRBuilderBase &Builder, Value *Val,
610  Value *Ptr, int Alignment,
611  Value *Mask) const -> Value * {
612  if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))
613  return UndefValue::get(Val->getType());
614  if (Mask == ConstantInt::getTrue(Mask->getType()))
615  return Builder.CreateAlignedStore(Val, Ptr, Align(Alignment));
616  return Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
617 }
618 
619 auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
620  -> DepList {
621  BasicBlock *Parent = Base->getParent();
622  assert(In->getParent() == Parent &&
623  "Base and In should be in the same block");
624  assert(Base->comesBefore(In) && "Base should come before In");
625 
626  DepList Deps;
627  std::deque<Instruction *> WorkQ = {In};
628  while (!WorkQ.empty()) {
629  Instruction *D = WorkQ.front();
630  WorkQ.pop_front();
631  Deps.insert(D);
632  for (Value *Op : D->operands()) {
633  if (auto *I = dyn_cast<Instruction>(Op)) {
634  if (I->getParent() == Parent && Base->comesBefore(I))
635  WorkQ.push_back(I);
636  }
637  }
638  }
639  return Deps;
640 }
641 
642 auto AlignVectors::createAddressGroups() -> bool {
643  // An address group created here may contain instructions spanning
644  // multiple basic blocks.
645  AddrList WorkStack;
646 
647  auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
648  for (AddrInfo &W : WorkStack) {
649  if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))
650  return std::make_pair(W.Inst, *D);
651  }
652  return std::make_pair(nullptr, 0);
653  };
654 
655  auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
656  BasicBlock &Block = *DomN->getBlock();
657  for (Instruction &I : Block) {
658  auto AI = this->getAddrInfo(I); // Use this-> for gcc6.
659  if (!AI)
660  continue;
661  auto F = findBaseAndOffset(*AI);
662  Instruction *GroupInst;
663  if (Instruction *BI = F.first) {
664  AI->Offset = F.second;
665  GroupInst = BI;
666  } else {
667  WorkStack.push_back(*AI);
668  GroupInst = AI->Inst;
669  }
670  AddrGroups[GroupInst].push_back(*AI);
671  }
672 
673  for (DomTreeNode *C : DomN->children())
674  Visit(C, Visit);
675 
676  while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
677  WorkStack.pop_back();
678  };
679 
680  traverseBlock(HVC.DT.getRootNode(), traverseBlock);
681  assert(WorkStack.empty());
682 
683  // AddrGroups are formed.
684 
685  // Remove groups of size 1.
686  erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; });
687  // Remove groups that don't use HVX types.
688  erase_if(AddrGroups, [&](auto &G) {
689  return llvm::none_of(
690  G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
691  });
692 
693  return !AddrGroups.empty();
694 }
695 
696 auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
697  // Form load groups.
698  // To avoid complications with moving code across basic blocks, only form
699  // groups that are contained within a single basic block.
700 
701  auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
702  assert(!Move.Main.empty() && "Move group should have non-empty Main");
703  // Don't mix HVX and non-HVX instructions.
704  if (Move.IsHvx != isHvx(Info))
705  return false;
706  // Leading instruction in the load group.
707  Instruction *Base = Move.Main.front();
708  if (Base->getParent() != Info.Inst->getParent())
709  return false;
710 
711  auto isSafeToMoveToBase = [&](const Instruction *I) {
712  return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator());
713  };
714  DepList Deps = getUpwardDeps(Info.Inst, Base);
715  if (!llvm::all_of(Deps, isSafeToMoveToBase))
716  return false;
717 
718  // The dependencies will be moved together with the load, so make sure
719  // that none of them could be moved independently in another group.
720  Deps.erase(Info.Inst);
721  auto inAddrMap = [&](Instruction *I) { return AddrGroups.count(I) > 0; };
722  if (llvm::any_of(Deps, inAddrMap))
723  return false;
724  Move.Main.push_back(Info.Inst);
725  llvm::append_range(Move.Deps, Deps);
726  return true;
727  };
728 
729  MoveList LoadGroups;
730 
731  for (const AddrInfo &Info : Group) {
732  if (!Info.Inst->mayReadFromMemory())
733  continue;
734  if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
735  LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);
736  }
737 
738  // Erase singleton groups.
739  erase_if(LoadGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
740  return LoadGroups;
741 }
742 
743 auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
744  // Form store groups.
745  // To avoid complications with moving code across basic blocks, only form
746  // groups that are contained within a single basic block.
747 
748  auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
749  assert(!Move.Main.empty() && "Move group should have non-empty Main");
750  // For stores with return values we'd have to collect downward dependencies.
751  // There are no such stores that we handle at the moment, so omit that.
752  assert(Info.Inst->getType()->isVoidTy() &&
753  "Not handling stores with return values");
754  // Don't mix HVX and non-HVX instructions.
755  if (Move.IsHvx != isHvx(Info))
756  return false;
757  // For stores we need to be careful whether it's safe to move them.
758  // Stores that are otherwise safe to move together may not appear safe
759  // to move over one another (i.e. isSafeToMoveBefore may return false).
760  Instruction *Base = Move.Main.front();
761  if (Base->getParent() != Info.Inst->getParent())
762  return false;
763  if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(), Move.Main))
764  return false;
765  Move.Main.push_back(Info.Inst);
766  return true;
767  };
768 
769  MoveList StoreGroups;
770 
771  for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
772  const AddrInfo &Info = *I;
773  if (!Info.Inst->mayWriteToMemory())
774  continue;
775  if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
776  StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);
777  }
778 
779  // Erase singleton groups.
780  erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
781  return StoreGroups;
782 }
783 
784 auto AlignVectors::move(const MoveGroup &Move) const -> bool {
785  assert(!Move.Main.empty() && "Move group should have non-empty Main");
786  Instruction *Where = Move.Main.front();
787 
788  if (Move.IsLoad) {
789  // Move all deps to before Where, keeping order.
790  for (Instruction *D : Move.Deps)
791  D->moveBefore(Where);
792  // Move all main instructions to after Where, keeping order.
793  ArrayRef<Instruction *> Main(Move.Main);
794  for (Instruction *M : Main.drop_front(1)) {
795  M->moveAfter(Where);
796  Where = M;
797  }
798  } else {
799  // NOTE: Deps are empty for "store" groups. If they need to be
800  // non-empty, decide on the order.
801  assert(Move.Deps.empty());
802  // Move all main instructions to before Where, inverting order.
803  ArrayRef<Instruction *> Main(Move.Main);
804  for (Instruction *M : Main.drop_front(1)) {
805  M->moveBefore(Where);
806  Where = M;
807  }
808  }
809 
810  return Move.Main.size() + Move.Deps.size() > 1;
811 }
812 
813 auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
814  const ByteSpan &VSpan, int ScLen,
815  Value *AlignVal, Value *AlignAddr) const
816  -> void {
817  Type *SecTy = HVC.getByteTy(ScLen);
818  int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
819  bool DoAlign = !HVC.isZero(AlignVal);
820  BasicBlock::iterator BasePos = Builder.GetInsertPoint();
821  BasicBlock *BaseBlock = Builder.GetInsertBlock();
822 
823  ByteSpan ASpan;
824  auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
825  auto *Undef = UndefValue::get(SecTy);
826 
827  SmallVector<Instruction *> Loads(NumSectors + DoAlign, nullptr);
828 
829  // We could create all of the aligned loads, and generate the valigns
830  // at the location of the first load, but for large load groups, this
831  // could create highly suboptimal code (there have been groups of 140+
832  // loads in real code).
833  // Instead, place the loads/valigns as close to the users as possible.
834  // In any case we need to have a mapping from the blocks of VSpan (the
835  // span covered by the pre-existing loads) to ASpan (the span covered
836  // by the aligned loads). There is a small problem, though: ASpan needs
837  // to have pointers to the loads/valigns, but we don't know where to put
838  // them yet. We can't use nullptr, because when we create sections of
839  // ASpan (corresponding to blocks from VSpan), for each block in the
840  // section we need to know which blocks of ASpan they are a part of.
841  // To have 1-1 mapping between blocks of ASpan and the temporary value
842  // pointers, use the addresses of the blocks themselves.
843 
844  // Populate the blocks first, to avoid reallocations of the vector
845  // interfering with generating the placeholder addresses.
846  for (int Index = 0; Index != NumSectors; ++Index)
847  ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
848  for (int Index = 0; Index != NumSectors; ++Index) {
849  ASpan.Blocks[Index].Seg.Val =
850  reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
851  }
852 
853  // Multiple values from VSpan can map to the same value in ASpan. Since we
854  // try to create loads lazily, we need to find the earliest use for each
855  // value from ASpan.
856  DenseMap<void *, Instruction *> EarliestUser;
857  auto isEarlier = [](Instruction *A, Instruction *B) {
858  if (B == nullptr)
859  return true;
860  if (A == nullptr)
861  return false;
862  assert(A->getParent() == B->getParent());
863  return A->comesBefore(B);
864  };
865  auto earliestUser = [&](const auto &Uses) {
866  Instruction *User = nullptr;
867  for (const Use &U : Uses) {
868  auto *I = dyn_cast<Instruction>(U.getUser());
869  assert(I != nullptr && "Load used in a non-instruction?");
870  // Make sure we only consider users in this block, but we need
871  // to remember if there were users outside the block too. This is
872  // because if there are no users, aligned loads will not be created.
873  if (I->getParent() == BaseBlock) {
874  if (!isa<PHINode>(I))
875  User = std::min(User, I, isEarlier);
876  } else {
877  User = std::min(User, BaseBlock->getTerminator(), isEarlier);
878  }
879  }
880  return User;
881  };
882 
883  for (const ByteSpan::Block &B : VSpan) {
884  ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
885  for (const ByteSpan::Block &S : ASection) {
886  EarliestUser[S.Seg.Val] = std::min(
887  EarliestUser[S.Seg.Val], earliestUser(B.Seg.Val->uses()), isEarlier);
888  }
889  }
890 
891  auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
892  int Index) {
893  Value *Ptr =
894  createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
895  // FIXME: generate a predicated load?
896  Value *Load = createAlignedLoad(Builder, SecTy, Ptr, ScLen, True, Undef);
897  // If vector shifting is potentially needed, accumulate metadata
898  // from source sections of twice the load width.
899  int Start = (Index - DoAlign) * ScLen;
900  int Width = (1 + DoAlign) * ScLen;
901  propagateMetadata(cast<Instruction>(Load),
902  VSpan.section(Start, Width).values());
903  return cast<Instruction>(Load);
904  };
905 
906  auto moveBefore = [this](Instruction *In, Instruction *To) {
907  // Move In and its upward dependencies to before To.
908  assert(In->getParent() == To->getParent());
909  DepList Deps = getUpwardDeps(In, To);
910  // DepList is sorted with respect to positions in the basic block.
911  for (Instruction *I : Deps)
912  I->moveBefore(To);
913  };
914 
915  // Generate necessary loads at appropriate locations.
916  for (int Index = 0; Index != NumSectors + 1; ++Index) {
917  // In ASpan, each block will be either a single aligned load, or a
918  // valign of a pair of loads. In the latter case, aligned load j
919  // is used by the valign for block j, and by the one for the previous
920  // block (for j > 0).
921  Instruction *PrevAt =
922  DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
923  Instruction *ThisAt =
924  Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
925  if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
926  Builder.SetInsertPoint(Where);
927  Loads[Index] = createLoad(Builder, VSpan, Index);
928  // We know it's safe to put the load at BasePos, so if it's not safe
929  // to move it from this location to BasePos, then the current location
930  // is not valid.
931  // We can't do this check proactively because we need the load to exist
932  // in order to check legality.
933  if (!HVC.isSafeToMoveBeforeInBB(*Loads[Index], BasePos))
934  moveBefore(Loads[Index], &*BasePos);
935  }
936  }
937  // Generate valigns if needed, and fill in proper values in ASpan
938  for (int Index = 0; Index != NumSectors; ++Index) {
939  ASpan[Index].Seg.Val = nullptr;
940  if (auto *Where = EarliestUser[&ASpan[Index]]) {
941  Builder.SetInsertPoint(Where);
942  Value *Val = Loads[Index];
943  assert(Val != nullptr);
944  if (DoAlign) {
945  Value *NextLoad = Loads[Index + 1];
946  assert(NextLoad != nullptr);
947  Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
948  }
949  ASpan[Index].Seg.Val = Val;
950  }
951  }
952 
953  for (const ByteSpan::Block &B : VSpan) {
954  ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
955  Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
956  Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));
957 
958  for (ByteSpan::Block &S : ASection) {
959  if (S.Seg.Val == nullptr)
960  continue;
961  // The processing of the data loaded by the aligned loads
962  // needs to be inserted after the data is available.
963  Instruction *SegI = cast<Instruction>(S.Seg.Val);
964  Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
965  Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
966  Accum = HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
967  }
968  // Instead of casting everything to bytes for the vselect, cast to the
969  // original value type. This will avoid complications with casting masks.
970  // For example, in cases when the original mask applied to i32, it could
971  // be converted to a mask applicable to i8 via pred_typecast intrinsic,
972  // but if the mask is not exactly of HVX length, extra handling would be
973  // needed to make it work.
974  Type *ValTy = getPayload(B.Seg.Val)->getType();
975  Value *Cast = Builder.CreateBitCast(Accum, ValTy);
976  Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
977  getPassThrough(B.Seg.Val));
978  B.Seg.Val->replaceAllUsesWith(Sel);
979  }
980 }
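// Editorial sketch of the load-group arithmetic above: the realigned span is
// covered by ceil(extent / ScLen) aligned sectors, plus one extra sector load
// when a nonzero AlignVal forces shifting (DoAlign), because each realigned
// block is then combined from two adjacent aligned loads:
//   ASpan[i] = vralign(Load[i], Load[i + 1], AlignVal)
// Illustrative numbers only.
static int numSectorLoadsSketch(int Extent, int ScLen, bool DoAlign) {
  int NumSectors = (Extent + ScLen - 1) / ScLen;
  // e.g. Extent = 192, ScLen = 128, DoAlign = true -> 2 sectors, 3 loads.
  return NumSectors + (DoAlign ? 1 : 0);
}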
981 
982 auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
983  const ByteSpan &VSpan, int ScLen,
984  Value *AlignVal, Value *AlignAddr) const
985  -> void {
986  Type *SecTy = HVC.getByteTy(ScLen);
987  int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
988  bool DoAlign = !HVC.isZero(AlignVal);
989 
990  // Stores.
991  ByteSpan ASpanV, ASpanM;
992 
993  // Return a vector value corresponding to the input value Val:
994  // either <1 x Val> for scalar Val, or Val itself for vector Val.
995  auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
996  Type *Ty = Val->getType();
997  if (Ty->isVectorTy())
998  return Val;
999  auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
1000  return Builder.CreateBitCast(Val, VecTy);
1001  };
1002 
1003  // Create an extra "undef" sector at the beginning and at the end.
1004  // They will be used as the left/right filler in the vlalign step.
1005  for (int i = (DoAlign ? -1 : 0); i != NumSectors + DoAlign; ++i) {
1006  // For stores, the size of each section is an aligned vector length.
1007  // Adjust the store offsets relative to the section start offset.
1008  ByteSpan VSection = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
1009  Value *AccumV = UndefValue::get(SecTy);
1010  Value *AccumM = HVC.getNullValue(SecTy);
1011  for (ByteSpan::Block &S : VSection) {
1012  Value *Pay = getPayload(S.Seg.Val);
1013  Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
1014  Pay->getType(), HVC.getByteTy());
1015  AccumM = HVC.insertb(Builder, AccumM, HVC.vbytes(Builder, Mask),
1016  S.Seg.Start, S.Seg.Size, S.Pos);
1017  AccumV = HVC.insertb(Builder, AccumV, HVC.vbytes(Builder, Pay),
1018  S.Seg.Start, S.Seg.Size, S.Pos);
1019  }
1020  ASpanV.Blocks.emplace_back(AccumV, ScLen, i * ScLen);
1021  ASpanM.Blocks.emplace_back(AccumM, ScLen, i * ScLen);
1022  }
1023 
1024  // vlalign
1025  if (DoAlign) {
1026  for (int j = 1; j != NumSectors + 2; ++j) {
1027  Value *PrevV = ASpanV[j - 1].Seg.Val, *ThisV = ASpanV[j].Seg.Val;
1028  Value *PrevM = ASpanM[j - 1].Seg.Val, *ThisM = ASpanM[j].Seg.Val;
1029  assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
1030  ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
1031  ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
1032  }
1033  }
1034 
1035  for (int i = 0; i != NumSectors + DoAlign; ++i) {
1036  Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
1037  Value *Val = ASpanV[i].Seg.Val;
1038  Value *Mask = ASpanM[i].Seg.Val; // bytes
1039  if (!HVC.isUndef(Val) && !HVC.isZero(Mask)) {
1040  Value *Store =
1041  createAlignedStore(Builder, Val, Ptr, ScLen, HVC.vlsb(Builder, Mask));
1042  // If vector shifting is potentially needed, accumulate metadata
1043  // from source sections of twice the store width.
1044  int Start = (i - DoAlign) * ScLen;
1045  int Width = (1 + DoAlign) * ScLen;
1046  propagateMetadata(cast<Instruction>(Store),
1047  VSpan.section(Start, Width).values());
1048  }
1049  }
1050 }
1051 
1052 auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
1053  // TODO: Needs support for masked loads/stores of "scalar" vectors.
1054  if (!Move.IsHvx)
1055  return false;
1056 
1057  // Return the element from Range with the maximum value of GetValue,
1058  // where GetValue obtains the value to compare from an element.
1059  auto getMaxOf = [](auto Range, auto GetValue) {
1060  return *std::max_element(
1061  Range.begin(), Range.end(),
1062  [&GetValue](auto &A, auto &B) { return GetValue(A) < GetValue(B); });
1063  };
1064 
1065  const AddrList &BaseInfos = AddrGroups.at(Move.Base);
1066 
1067  // Conceptually, there is a vector of N bytes covering the addresses
1068  // starting from the minimum offset (i.e. Base.Addr+Start). This vector
1069  // represents a contiguous memory region that spans all accessed memory
1070  // locations.
1071  // The correspondence between loaded or stored values will be expressed
1072  // in terms of this vector. For example, the 0th element of the vector
1073  // from the Base address info will start at byte Start from the beginning
1074  // of this conceptual vector.
1075  //
1076  // This vector will be loaded/stored starting at the nearest down-aligned
1077  // address and the amount of the down-alignment will be AlignVal:
1078  // valign(load_vector(align_down(Base+Start)), AlignVal)
1079 
1080  std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
1081  AddrList MoveInfos;
1082  llvm::copy_if(
1083  BaseInfos, std::back_inserter(MoveInfos),
1084  [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });
1085 
1086  // Maximum alignment present in the whole address group.
1087  const AddrInfo &WithMaxAlign =
1088  getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
1089  Align MaxGiven = WithMaxAlign.HaveAlign;
1090 
1091  // Address group member with the smallest offset in the move group.
1092  const AddrInfo &WithMinOffset =
1093  getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
1094 
1095  const AddrInfo &WithMaxNeeded =
1096  getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
1097  Align MinNeeded = WithMaxNeeded.NeedAlign;
1098 
1099  // Set the builder's insertion point right before the load group, or
1100  // immediately after the store group. (Instructions in a store group are
1101  // listed in reverse order.)
1102  Instruction *InsertAt = Move.Main.front();
1103  if (!Move.IsLoad) {
1104  // There should be a terminator (which a store isn't, but check anyway).
1105  assert(InsertAt->getIterator() != InsertAt->getParent()->end());
1106  InsertAt = &*std::next(InsertAt->getIterator());
1107  }
1108 
1109  IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
1110  InstSimplifyFolder(HVC.DL));
1111  Value *AlignAddr = nullptr; // Actual aligned address.
1112  Value *AlignVal = nullptr; // Right-shift amount (for valign).
1113 
1114  if (MinNeeded <= MaxGiven) {
1115  int Start = WithMinOffset.Offset;
1116  int OffAtMax = WithMaxAlign.Offset;
1117  // Shift the offset of the maximally aligned instruction (OffAtMax)
1118  // back by just enough multiples of the required alignment to cover the
1119  // distance from Start to OffAtMax.
1120  // Calculate the address adjustment amount based on the address with the
1121  // maximum alignment. This is to allow a simple gep instruction instead
1122  // of potential bitcasts to i8*.
1123  int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());
1124  AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,
1125  WithMaxAlign.ValTy, Adjust);
1126  int Diff = Start - (OffAtMax + Adjust);
1127  AlignVal = HVC.getConstInt(Diff);
1128  assert(Diff >= 0);
1129  assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
1130  } else {
1131  // WithMinOffset is the lowest address in the group,
1132  // WithMinOffset.Addr = Base+Start.
1133  // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
1134  // mask off unnecessary bits, so it's ok to just use the original pointer as
1135  // the alignment amount.
1136  // Do an explicit down-alignment of the address to avoid creating an
1137  // aligned instruction with an address that is not really aligned.
1138  AlignAddr = createAlignedPointer(Builder, WithMinOffset.Addr,
1139  WithMinOffset.ValTy, MinNeeded.value());
1140  AlignVal = Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy());
1141  }
1142 
1143  ByteSpan VSpan;
1144  for (const AddrInfo &AI : MoveInfos) {
1145  VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),
1146  AI.Offset - WithMinOffset.Offset);
1147  }
1148 
1149  // The aligned loads/stores will use blocks that are either scalars,
1150  // or HVX vectors. Let "sector" be the unified term for such a block.
1151  // blend(scalar, vector) -> sector...
1152  int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
1153  : std::max<int>(MinNeeded.value(), 4);
1154  assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
1155  assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
1156 
1157  if (Move.IsLoad)
1158  realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1159  else
1160  realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1161 
1162  for (auto *Inst : Move.Main)
1163  Inst->eraseFromParent();
1164 
1165  return true;
1166 }
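// Editorial sketch with illustrative numbers for the "MinNeeded <= MaxGiven"
// branch above. Suppose the lowest offset in the group is Start = -4, the
// maximally aligned member is at OffAtMax = 0, and MinNeeded = 8:
//   Adjust = -alignTo(OffAtMax - Start, MinNeeded) = -alignTo(4, 8) = -8
//   Diff   = Start - (OffAtMax + Adjust)           = -4 - (0 - 8)   =  4
// so AlignAddr ends up 8 bytes before the most aligned member, and the group's
// data begins Diff = 4 bytes past it (0 <= Diff < MinNeeded), which becomes
// the valign shift amount.
static int realignDiffSketch(int Start, int OffAtMax, int MinNeeded) {
  int Adjust = -(((OffAtMax - Start) + MinNeeded - 1) / MinNeeded) * MinNeeded;
  return Start - (OffAtMax + Adjust);
}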
1167 
1168 auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
1169  if (!HVC.isByteVecTy(Ty))
1170  return false;
1171  int Size = HVC.getSizeOf(Ty);
1172  if (HVC.HST.isTypeForHVX(Ty))
1173  return Size == static_cast<int>(HVC.HST.getVectorLength());
1174  return Size == 4 || Size == 8;
1175 }
1176 
1177 auto AlignVectors::run() -> bool {
1178  if (!createAddressGroups())
1179  return false;
1180 
1181  bool Changed = false;
1182  MoveList LoadGroups, StoreGroups;
1183 
1184  for (auto &G : AddrGroups) {
1185  llvm::append_range(LoadGroups, createLoadGroups(G.second));
1186  llvm::append_range(StoreGroups, createStoreGroups(G.second));
1187  }
1188 
1189  for (auto &M : LoadGroups)
1190  Changed |= move(M);
1191  for (auto &M : StoreGroups)
1192  Changed |= move(M);
1193 
1194  for (auto &M : LoadGroups)
1195  Changed |= realignGroup(M);
1196  for (auto &M : StoreGroups)
1197  Changed |= realignGroup(M);
1198 
1199  return Changed;
1200 }
1201 
1202 // --- End AlignVectors
1203 
1204 // --- Begin HvxIdioms
1205 
1206 auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
1207  -> std::pair<unsigned, Signedness> {
1208  unsigned Bits = HVC.getNumSignificantBits(V, In);
1209  // The significant bits are calculated including the sign bit. This may
1210  // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
1211  // result in 33 significant bits. To avoid extra words, skip the extra
1212  // sign bit, but keep information that the value is to be treated as
1213  // unsigned.
1214  KnownBits Known = HVC.getKnownBits(V, In);
1215  Signedness Sign = Signed;
1216  unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
1217  if (isPowerOf2_32(Bits))
1218  NumToTest = Bits;
1219  else if (Bits > 1 && isPowerOf2_32(Bits - 1))
1220  NumToTest = Bits - 1;
1221 
1222  if (NumToTest != 0 && Known.Zero.ashr(NumToTest).isAllOnes()) {
1223  Sign = Unsigned;
1224  Bits = NumToTest;
1225  }
1226 
1227  // If the top bit of the nearest power-of-2 is zero, this value is
1228  // positive. It could be treated as either signed or unsigned.
1229  if (unsigned Pow2 = PowerOf2Ceil(Bits); Pow2 != Bits) {
1230  if (Known.Zero.ashr(Pow2 - 1).isAllOnes())
1231  Sign = Positive;
1232  }
1233  return {Bits, Sign};
1234 }
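// Editorial example for the significant-bits logic above. Counting the sign
// bit, the value 0xFFFF held in an i32 needs 17 bits as a signed quantity
// (bit 16 must stay 0 to keep it non-negative), but since the upper bits are
// known zero it fits in 16 bits when treated as unsigned, which is exactly
// the narrowing performed above. Hypothetical scalar helper:
static int signedSignificantBitsSketch(int V) {
  int Bits = 32;
  while (Bits > 1 && ((V >> (Bits - 2)) == 0 || (V >> (Bits - 2)) == -1))
    --Bits; // drop redundant copies of the sign bit
  return Bits; // signedSignificantBitsSketch(0xFFFF) == 17
}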
1235 
1236 auto HvxIdioms::canonSgn(SValue X, SValue Y) const
1237  -> std::pair<SValue, SValue> {
1238  // Canonicalize the signedness of X and Y, so that the result is one of:
1239  // S, S
1240  // U/P, S
1241  // U/P, U/P
1242  if (X.Sgn == Signed && Y.Sgn != Signed)
1243  std::swap(X, Y);
1244  return {X, Y};
1245 }
1246 
1247 // Match
1248 // (X * Y) [>> N], or
1249 // ((X * Y) + (1 << N-1)) >> N
1250 auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
1251  using namespace PatternMatch;
1252  auto *Ty = In.getType();
1253 
1254  if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
1255  return std::nullopt;
1256 
1257  unsigned Width = cast<IntegerType>(Ty->getScalarType())->getBitWidth();
1258 
1259  FxpOp Op;
1260  Value *Exp = &In;
1261 
1262  // Fixed-point multiplication is always shifted right (except when the
1263  // fraction is 0 bits).
1264  auto m_Shr = [](auto &&V, auto &&S) {
1265  return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
1266  };
1267 
1268  const APInt *Qn = nullptr;
1269  if (Value * T; match(Exp, m_Shr(m_Value(T), m_APInt(Qn)))) {
1270  Op.Frac = Qn->getZExtValue();
1271  Exp = T;
1272  } else {
1273  Op.Frac = 0;
1274  }
1275 
1276  if (Op.Frac > Width)
1277  return std::nullopt;
1278 
1279  // Check if there is rounding added.
1280  const APInt *C = nullptr;
1281  if (Value * T; Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_APInt(C)))) {
1282  unsigned CV = C->getZExtValue();
1283  if (CV != 0 && !isPowerOf2_32(CV))
1284  return std::nullopt;
1285  if (CV != 0)
1286  Op.RoundAt = Log2_32(CV);
1287  Exp = T;
1288  }
1289 
1290  // Check if the rest is a multiplication.
1291  if (match(Exp, m_Mul(m_Value(Op.X.Val), m_Value(Op.Y.Val)))) {
1292  Op.Opcode = Instruction::Mul;
1293  // FIXME: The information below is recomputed.
1294  Op.X.Sgn = getNumSignificantBits(Op.X.Val, &In).second;
1295  Op.Y.Sgn = getNumSignificantBits(Op.Y.Val, &In).second;
1296  return Op;
1297  }
1298 
1299  return std::nullopt;
1300 }
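// Editorial sketch: the scalar shape of the idiom matched above, i.e.
//   (X * Y) >> Frac    or    ((X * Y) + (1 << (Frac - 1))) >> Frac
// For 16-bit Q15 data (Frac = 15) with rounding this is the usual fractional
// multiply; the pass later maps its vector form onto HVX intrinsics (which
// additionally saturate, omitted here for brevity).
static short q15MulRoundSketch(short X, short Y) {
  int Prod = static_cast<int>(X) * static_cast<int>(Y); // full-width product
  return static_cast<short>((Prod + (1 << 14)) >> 15);  // RoundAt = Frac - 1
}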
1301 
1302 auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
1303  -> Value * {
1304  assert(Op.X.Val->getType() == Op.Y.Val->getType());
1305 
1306  auto *VecTy = dyn_cast<VectorType>(Op.X.Val->getType());
1307  if (VecTy == nullptr)
1308  return nullptr;
1309  auto *ElemTy = cast<IntegerType>(VecTy->getElementType());
1310  unsigned ElemWidth = ElemTy->getBitWidth();
1311 
1312  // TODO: This can be relaxed after legalization is done pre-isel.
1313  if ((HVC.length(VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
1314  return nullptr;
1315 
1316  // There are no special intrinsics that should be used for multiplying
1317  // signed 8-bit values, so just skip them. Normal codegen should handle
1318  // this just fine.
1319  if (ElemWidth <= 8)
1320  return nullptr;
1321  // Similarly, if this is just a multiplication that can be handled without
1322  // intervention, then leave it alone.
1323  if (ElemWidth <= 32 && Op.Frac == 0)
1324  return nullptr;
1325 
1326  auto [BitsX, SignX] = getNumSignificantBits(Op.X.Val, &In);
1327  auto [BitsY, SignY] = getNumSignificantBits(Op.Y.Val, &In);
1328 
1329  // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).
1330 
1331  Value *X = Op.X.Val, *Y = Op.Y.Val;
1332  IRBuilder Builder(In.getParent(), In.getIterator(),
1333  InstSimplifyFolder(HVC.DL));
1334 
1335  auto roundUpWidth = [](unsigned Width) -> unsigned {
1336  if (Width <= 32 && !isPowerOf2_32(Width)) {
1337  // If the element width is not a power of 2, round it up
1338  // to the next one. Do this for widths not exceeding 32.
1339  return PowerOf2Ceil(Width);
1340  }
1341  if (Width > 32 && Width % 32 != 0) {
1342  // For wider elements, round it up to the multiple of 32.
1343  return alignTo(Width, 32u);
1344  }
1345  return Width;
1346  };
1347 
1348  BitsX = roundUpWidth(BitsX);
1349  BitsY = roundUpWidth(BitsY);
1350 
1351  // For elementwise multiplication vectors must have the same lengths, so
1352  // resize the elements of both inputs to the same width, the max of the
1353  // calculated significant bits.
1354  unsigned Width = std::max(BitsX, BitsY);
1355 
1356  auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);
1357  if (Width < ElemWidth) {
1358  X = Builder.CreateTrunc(X, ResizeTy);
1359  Y = Builder.CreateTrunc(Y, ResizeTy);
1360  } else if (Width > ElemWidth) {
1361  X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy)
1362  : Builder.CreateZExt(X, ResizeTy);
1363  Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy)
1364  : Builder.CreateZExt(Y, ResizeTy);
1365  }
1366 
1367  assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1368 
1369  unsigned VecLen = HVC.length(ResizeTy);
1370  unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(Width, 32u);
1371 
1372  SmallVector<Value *> Results;
1373  FxpOp ChopOp = Op;
1374 
1375  for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1376  ChopOp.X.Val = HVC.subvector(Builder, X, V * ChopLen, ChopLen);
1377  ChopOp.Y.Val = HVC.subvector(Builder, Y, V * ChopLen, ChopLen);
1378  Results.push_back(processFxpMulChopped(Builder, In, ChopOp));
1379  if (Results.back() == nullptr)
1380  break;
1381  }
1382 
1383  if (Results.empty() || Results.back() == nullptr)
1384  return nullptr;
1385 
1386  Value *Cat = HVC.concat(Builder, Results);
1387  Value *Ext = SignX == Signed || SignY == Signed
1388  ? Builder.CreateSExt(Cat, VecTy)
1389  : Builder.CreateZExt(Cat, VecTy);
1390  return Ext;
1391 }
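// Editorial sketch of the width handling above: significant widths are
// rounded up to a power of two (when <= 32) or to a multiple of 32, and the
// inputs are then processed in chops of ChopLen = 8 * VectorLength /
// min(Width, 32) elements so that each chop fills whole HVX vectors.
// Illustrative helper only.
static unsigned roundUpWidthSketch(unsigned Width) {
  if (Width <= 32) {
    unsigned P = 1;
    while (P < Width)
      P *= 2;                    // next power of two, e.g. 17 -> 32, 24 -> 32
    return P;
  }
  return (Width + 31) / 32 * 32; // next multiple of 32, e.g. 40 -> 64
}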
1392 
1393 auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
1394  const FxpOp &Op) const -> Value * {
1395  assert(Op.X.Val->getType() == Op.Y.Val->getType());
1396  auto *InpTy = cast<VectorType>(Op.X.Val->getType());
1397  unsigned Width = InpTy->getScalarSizeInBits();
1398  bool Rounding = Op.RoundAt.has_value();
1399 
1400  if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
1401  // The fixed-point intrinsics do signed multiplication.
1402  if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
1403  Value *QMul = nullptr;
1404  if (Width == 16) {
1405  QMul = createMulQ15(Builder, Op.X, Op.Y, Rounding);
1406  } else if (Width == 32) {
1407  QMul = createMulQ31(Builder, Op.X, Op.Y, Rounding);
1408  }
1409  if (QMul != nullptr)
1410  return QMul;
1411  }
1412  }
1413 
1414  assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n
1415  assert(Width < 32 || Width % 32 == 0); // Width > 32 => Width is 32*k
1416 
1417  // If Width < 32, then it should really be 16.
1418  if (Width < 32) {
1419  if (Width < 16)
1420  return nullptr;
1421  // Getting here with Op.Frac == 0 isn't wrong, but suboptimal: here we
1422  // generate a full-precision product, which is unnecessary if there is
1423  // no shift.
1424  assert(Width == 16);
1425  assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
1426  if (Op.Frac == 16) {
1427  // Multiply high
1428  if (Value *MulH = createMulH16(Builder, Op.X, Op.Y))
1429  return MulH;
1430  }
1431  // Do full-precision multiply and shift.
1432  Value *Prod32 = createMul16(Builder, Op.X, Op.Y);
1433  if (Rounding) {
1434  Value *RoundVal = HVC.getConstSplat(Prod32->getType(), 1 << *Op.RoundAt);
1435  Prod32 = Builder.CreateAdd(Prod32, RoundVal);
1436  }
1437 
1438  Value *ShiftAmt = HVC.getConstSplat(Prod32->getType(), Op.Frac);
1439  Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
1440  ? Builder.CreateAShr(Prod32, ShiftAmt)
1441  : Builder.CreateLShr(Prod32, ShiftAmt);
1442  return Builder.CreateTrunc(Shifted, InpTy);
1443  }
1444 
1445  // Width >= 32
1446 
1447  // Break up the arguments Op.X and Op.Y into vectors of smaller widths
1448  // in preparation of doing the multiplication by 32-bit parts.
1449  auto WordX = HVC.splitVectorElements(Builder, Op.X.Val, /*ToWidth=*/32);
1450  auto WordY = HVC.splitVectorElements(Builder, Op.Y.Val, /*ToWidth=*/32);
1451  auto WordP = createMulLong(Builder, WordX, Op.X.Sgn, WordY, Op.Y.Sgn);
1452 
1453  auto *HvxWordTy = cast<VectorType>(WordP.front()->getType());
1454 
1455  // Add the optional rounding to the proper word.
1456  if (Op.RoundAt.has_value()) {
1457  Value *Zero = HVC.getNullValue(WordX[0]->getType());
1458  SmallVector<Value *> RoundV(WordP.size(), Zero);
1459  RoundV[*Op.RoundAt / 32] =
1460  HVC.getConstSplat(HvxWordTy, 1 << (*Op.RoundAt % 32));
1461  WordP = createAddLong(Builder, WordP, RoundV);
1462  }
1463 
1464  // createRightShiftLong?
1465 
1466  // Shift all products right by Op.Frac.
1467  unsigned SkipWords = Op.Frac / 32;
1468  Constant *ShiftAmt = HVC.getConstSplat(HvxWordTy, Op.Frac % 32);
1469 
1470  for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
1471  int Src = Dst + SkipWords;
1472  Value *Lo = WordP[Src];
1473  if (Src + 1 < End) {
1474  Value *Hi = WordP[Src + 1];
1475  WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,
1476  {Hi, Lo, ShiftAmt});
1477  } else {
1478  // The shift of the most significant word.
1479  WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt);
1480  }
1481  }
1482  if (SkipWords != 0)
1483  WordP.resize(WordP.size() - SkipWords);
1484 
1485  return HVC.joinVectorElements(Builder, WordP, InpTy);
1486 }
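// Editorial sketch of the cross-word shift above: shifting a multi-word value
// right by Frac bits discards Frac/32 whole words and combines each remaining
// word with its higher neighbour, which is what the llvm.fshr(Hi, Lo, Amt)
// calls express; for 0 < Amt < 32,
//   fshr(Hi, Lo, Amt) == (Lo >> Amt) | (Hi << (32 - Amt)).
// This sketch zero-fills the topmost word, whereas the code above uses an
// arithmetic shift there to keep the sign.
static void shiftWordsRightSketch(unsigned *Words, int NumWords,
                                  unsigned Frac) {
  int Skip = static_cast<int>(Frac / 32);
  unsigned Amt = Frac % 32;
  for (int Dst = 0, End = NumWords - Skip; Dst != End; ++Dst) {
    unsigned Lo = Words[Dst + Skip];
    unsigned Hi = Dst + Skip + 1 < NumWords ? Words[Dst + Skip + 1] : 0;
    Words[Dst] = Amt == 0 ? Lo : (Lo >> Amt) | (Hi << (32 - Amt));
  }
}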
1487 
1488 auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
1489  bool Rounding) const -> Value * {
1490  assert(X.Val->getType() == Y.Val->getType());
1491  assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
1492  assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
1493 
1494  // There is no non-rounding intrinsic for i16.
1495  if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
1496  return nullptr;
1497 
1498  auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhvsrs);
1499  return HVC.createHvxIntrinsic(Builder, V6_vmpyhvsrs, X.Val->getType(),
1500  {X.Val, Y.Val});
1501 }
1502 
1503 auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
1504  bool Rounding) const -> Value * {
1505  Type *InpTy = X.Val->getType();
1506  assert(InpTy == Y.Val->getType());
1507  assert(InpTy->getScalarType() == HVC.getIntTy(32));
1508  assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
1509 
1510  if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
1511  return nullptr;
1512 
1513  auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyewuh);
1514  auto V6_vmpyo_acc = Rounding
1515  ? HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_rnd_sacc)
1516  : HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_sacc);
1517  Value *V1 =
1518  HVC.createHvxIntrinsic(Builder, V6_vmpyewuh, InpTy, {X.Val, Y.Val});
1519  return HVC.createHvxIntrinsic(Builder, V6_vmpyo_acc, InpTy,
1520  {V1, X.Val, Y.Val});
1521 }
1522 
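createMulQ31 maps a signed Q31 multiply onto the vmpyewuh/vmpyowh pair; per 32-bit lane the intended result is the 64-bit product shifted down by 31, rounded when the rounding variant is used. A hedged scalar reference for one lane (illustrative only, not part of this file):

#include <cstdint>

// Sketch only: rounding Q31 multiply of two signed 32-bit fixed-point values.
static int32_t mulQ31Scalar(int32_t X, int32_t Y) {
  int64_t Prod = static_cast<int64_t>(X) * Y;
  return static_cast<int32_t>((Prod + (int64_t{1} << 30)) >> 31);
}
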
1523 auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
1524  Value *CarryIn) const
1525  -> std::pair<Value *, Value *> {
1526  assert(X->getType() == Y->getType());
1527  auto VecTy = cast<VectorType>(X->getType());
1528  if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
1529  SmallVector<Value *> Args = {X, Y};
1530  Intrinsic::ID AddCarry;
1531  if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
1532  AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarryo);
1533  } else {
1534  AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry);
1535  if (CarryIn == nullptr)
1536  CarryIn = HVC.getNullValue(HVC.getBoolTy(HVC.length(VecTy)));
1537  Args.push_back(CarryIn);
1538  }
1539  Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,
1540  /*RetTy=*/nullptr, Args);
1541  Value *Result = Builder.CreateExtractValue(Ret, {0});
1542  Value *CarryOut = Builder.CreateExtractValue(Ret, {1});
1543  return {Result, CarryOut};
1544  }
1545 
1546  // In other cases, do a regular add, and unsigned compare-less-than.
1547  // The carry-out can originate in two places: adding the carry-in or adding
1548  // the two input values.
1549  Value *Result1 = X; // Result1 = X + CarryIn
1550  if (CarryIn != nullptr) {
1551  unsigned Width = VecTy->getScalarSizeInBits();
1552  uint32_t Mask = 1;
1553  if (Width < 32) {
1554  for (unsigned i = 0, e = 32 / Width; i != e; ++i)
1555  Mask = (Mask << Width) | 1;
1556  }
1557  auto V6_vandqrt = HVC.HST.getIntrinsicId(Hexagon::V6_vandqrt);
1558  Value *ValueIn =
1559  HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,
1560  {CarryIn, HVC.getConstInt(Mask)});
1561  Result1 = Builder.CreateAdd(X, ValueIn);
1562  }
1563 
1564  Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X);
1565  Value *Result2 = Builder.CreateAdd(Result1, Y);
1566  Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y);
1567  return {Result2, Builder.CreateOr(CarryOut1, CarryOut2)};
1568 }
1569 
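The fallback path in createAddCarry detects a carry with unsigned compares: a sum that wrapped is strictly smaller than one of its addends, and the carry-out can come either from adding the carry-in or from adding the second operand. A scalar sketch of the same logic for one 32-bit lane (names illustrative):

#include <cstdint>
#include <utility>

// Sketch only: 32-bit add with carry-in; carry-out is detected by comparing
// each partial sum against one of its addends, as in the fallback above.
static std::pair<uint32_t, bool> addCarryScalar(uint32_t X, uint32_t Y,
                                                bool CarryIn) {
  uint32_t R1 = X + (CarryIn ? 1u : 0u);
  bool C1 = R1 < X;   // wrapped while adding the carry-in
  uint32_t R2 = R1 + Y;
  bool C2 = R2 < Y;   // wrapped while adding Y
  return {R2, C1 || C2};
}
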
1570 auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
1571  -> Value * {
1572  Intrinsic::ID V6_vmpyh = 0;
1573  std::tie(X, Y) = canonSgn(X, Y);
1574 
1575  if (X.Sgn == Signed) {
1576  V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhv);
1577  } else if (Y.Sgn == Signed) {
1578  // In vmpyhus the second operand is unsigned
1579  V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhus);
1580  } else {
1581  V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhv);
1582  }
1583 
1584  // i16*i16 -> i32 / interleaved
1585  Value *P =
1586  HVC.createHvxIntrinsic(Builder, V6_vmpyh, HvxP32Ty, {Y.Val, X.Val});
1587  // Deinterleave
1588  return HVC.vshuff(Builder, HVC.sublo(Builder, P), HVC.subhi(Builder, P));
1589 }
1590 
1591 auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
1592  -> Value * {
1593  Type *HvxI16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/false);
1594 
1595  if (HVC.HST.useHVXV69Ops()) {
1596  if (X.Sgn != Signed && Y.Sgn != Signed) {
1597  auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhvs);
1598  return HVC.createHvxIntrinsic(Builder, V6_vmpyuhvs, HvxI16Ty,
1599  {X.Val, Y.Val});
1600  }
1601  }
1602 
1603  Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);
1604  Value *Pair16 = Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty);
1605  unsigned Len = HVC.length(HvxP16Ty) / 2;
1606 
1607  SmallVector<int, 128> PickOdd(Len);
1608  for (int i = 0; i != static_cast<int>(Len); ++i)
1609  PickOdd[i] = 2 * i + 1;
1610 
1611  return Builder.CreateShuffleVector(HVC.sublo(Builder, Pair16),
1612  HVC.subhi(Builder, Pair16), PickOdd);
1613 }
1614 
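createMulH16 computes the upper half of each 16x16->32 product by doing the full multiply and then selecting the odd 16-bit lanes of the widened result. Per lane, for signed inputs, the intended value is simply the following (illustrative scalar form, not part of this file):

#include <cstdint>

// Sketch only: high 16 bits of a signed 16 x 16 -> 32 multiply.
static int16_t mulHigh16Scalar(int16_t X, int16_t Y) {
  int32_t Prod = static_cast<int32_t>(X) * Y;
  return static_cast<int16_t>(Prod >> 16);
}
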
1615 auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
1616  -> std::pair<Value *, Value *> {
1617  assert(X.Val->getType() == Y.Val->getType());
1618  assert(X.Val->getType() == HvxI32Ty);
1619 
1620  Intrinsic::ID V6_vmpy_parts;
1621  std::tie(X, Y) = canonSgn(X, Y);
1622 
1623  if (X.Sgn == Signed) {
1624  V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
1625  } else if (Y.Sgn == Signed) {
1626  V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
1627  } else {
1628  V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
1629  }
1630 
1631  Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,
1632  {X.Val, Y.Val}, {HvxI32Ty});
1633  Value *Hi = Builder.CreateExtractValue(Parts, {0});
1634  Value *Lo = Builder.CreateExtractValue(Parts, {1});
1635  return {Lo, Hi};
1636 }
1637 
1638 auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
1639  ArrayRef<Value *> WordY) const
1640  -> SmallVector<Value *> {
1641  assert(WordX.size() == WordY.size());
1642  unsigned Idx = 0, Length = WordX.size();
1643  SmallVector<Value *> Sum(Length);
1644 
1645  while (Idx != Length) {
1646  if (HVC.isZero(WordX[Idx]))
1647  Sum[Idx] = WordY[Idx];
1648  else if (HVC.isZero(WordY[Idx]))
1649  Sum[Idx] = WordX[Idx];
1650  else
1651  break;
1652  ++Idx;
1653  }
1654 
1655  Value *Carry = nullptr;
1656  for (; Idx != Length; ++Idx) {
1657  std::tie(Sum[Idx], Carry) =
1658  createAddCarry(Builder, WordX[Idx], WordY[Idx], Carry);
1659  }
1660 
1661  // This drops the final carry beyond the highest word.
1662  return Sum;
1663 }
1664 
1665 auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
1666  Signedness SgnX, ArrayRef<Value *> WordY,
1667  Signedness SgnY) const -> SmallVector<Value *> {
1668  SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
1669 
1670  // WordX[i] * WordY[j] produces words i+j and i+j+1 of the results,
1671  // that is halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
1672  for (int i = 0, e = WordX.size(); i != e; ++i) {
1673  for (int j = 0, f = WordY.size(); j != f; ++j) {
1674  // Check the 4 halves that this multiplication can generate.
1675  Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
1676  Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
1677  auto [Lo, Hi] = createMul32(Builder, {WordX[i], SX}, {WordY[j], SY});
1678  Products[i + j + 0].push_back(Lo);
1679  Products[i + j + 1].push_back(Hi);
1680  }
1681  }
1682 
1683  Value *Zero = HVC.getNullValue(WordX[0]->getType());
1684 
1685  auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
1686  if (Vector.empty())
1687  return Zero;
1688  auto Last = Vector.back();
1689  Vector.pop_back();
1690  return Last;
1691  };
1692 
1693  for (int i = 0, e = Products.size(); i != e; ++i) {
1694  while (Products[i].size() > 1) {
1695  Value *Carry = nullptr; // no carry-in
1696  for (int j = i; j != e; ++j) {
1697  auto &ProdJ = Products[j];
1698  auto [Sum, CarryOut] = createAddCarry(Builder, pop_back_or_zero(ProdJ),
1699  pop_back_or_zero(ProdJ), Carry);
1700  ProdJ.insert(ProdJ.begin(), Sum);
1701  Carry = CarryOut;
1702  }
1703  }
1704  }
1705 
1706  SmallVector<Value *> WordP;
1707  for (auto &P : Products) {
1708  assert(P.size() == 1 && "Should have been added together");
1709  WordP.push_back(P.front());
1710  }
1711 
1712  return WordP;
1713 }
1714 
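createMulLong is schoolbook long multiplication over 32-bit words: the product of words i and j contributes its low half to column i+j and its high half to column i+j+1, and the columns are then reduced with createAddCarry. A compact unsigned scalar sketch of the same scheme (this sketch ignores the mixed-signedness handling of the top words; names are illustrative):

#include <cstdint>
#include <vector>

// Sketch only: unsigned multi-word multiplication with 32-bit limbs.
static std::vector<uint32_t> mulLongScalar(const std::vector<uint32_t> &X,
                                           const std::vector<uint32_t> &Y) {
  std::vector<uint32_t> P(X.size() + Y.size(), 0);
  for (size_t i = 0; i != X.size(); ++i) {
    uint64_t Carry = 0;
    for (size_t j = 0; j != Y.size(); ++j) {
      uint64_t T = uint64_t{X[i]} * Y[j] + P[i + j] + Carry;
      P[i + j] = static_cast<uint32_t>(T); // low half stays in column i+j
      Carry = T >> 32;                     // high half moves to column i+j+1
    }
    P[i + Y.size()] = static_cast<uint32_t>(Carry);
  }
  return P;
}
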
1715 auto HvxIdioms::run() -> bool {
1716  bool Changed = false;
1717 
1718  for (BasicBlock &B : HVC.F) {
1719  for (auto It = B.rbegin(); It != B.rend(); ++It) {
1720  if (auto Fxm = matchFxpMul(*It)) {
1721  Value *New = processFxpMul(*It, *Fxm);
1722  // Always report "changed" for now.
1723  Changed = true;
1724  if (!New)
1725  continue;
1726  bool StartOver = !isa<Instruction>(New);
1727  It->replaceAllUsesWith(New);
1728  RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
1729  It = StartOver ? B.rbegin()
1730  : cast<Instruction>(New)->getReverseIterator();
1731  Changed = true;
1732  }
1733  }
1734  }
1735 
1736  return Changed;
1737 }
1738 
1739 // --- End HvxIdioms
1740 
1741 auto HexagonVectorCombine::run() -> bool {
1742  if (!HST.useHVXOps())
1743  return false;
1744 
1745  bool Changed = false;
1746  Changed |= AlignVectors(*this).run();
1747  Changed |= HvxIdioms(*this).run();
1748 
1749  return Changed;
1750 }
1751 
1752 auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
1753  return IntegerType::get(F.getContext(), Width);
1754 }
1755 
1756 auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
1757  assert(ElemCount >= 0);
1758  IntegerType *ByteTy = Type::getInt8Ty(F.getContext());
1759  if (ElemCount == 0)
1760  return ByteTy;
1761  return VectorType::get(ByteTy, ElemCount, /*Scalable=*/false);
1762 }
1763 
1764 auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
1765  assert(ElemCount >= 0);
1766  IntegerType *BoolTy = Type::getInt1Ty(F.getContext());
1767  if (ElemCount == 0)
1768  return BoolTy;
1769  return VectorType::get(BoolTy, ElemCount, /*Scalable=*/false);
1770 }
1771 
1772 auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
1773  -> ConstantInt * {
1774  return ConstantInt::getSigned(getIntTy(Width), Val);
1775 }
1776 
1777 auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
1778  if (auto *C = dyn_cast<Constant>(Val))
1779  return C->isZeroValue();
1780  return false;
1781 }
1782 
1783 auto HexagonVectorCombine::getIntValue(const Value *Val) const
1784  -> std::optional<APInt> {
1785  if (auto *CI = dyn_cast<ConstantInt>(Val))
1786  return CI->getValue();
1787  return std::nullopt;
1788 }
1789 
1790 auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
1791  return isa<UndefValue>(Val);
1792 }
1793 
1794 auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
1795  -> VectorType * {
1796  EVT ETy = EVT::getEVT(ElemTy, false);
1797  assert(ETy.isSimple() && "Invalid HVX element type");
1798  // Do not allow boolean types here: they don't have a fixed length.
1799  assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
1800  "Invalid HVX element type");
1801  unsigned HwLen = HST.getVectorLength();
1802  unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
1803  return VectorType::get(ElemTy, Pair ? 2 * NumElems : NumElems,
1804  /*Scalable=*/false);
1805 }
1806 
1807 auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
1808  -> int {
1809  return getSizeOf(Val->getType(), Kind);
1810 }
1811 
1812 auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
1813  -> int {
1814  auto *NcTy = const_cast<Type *>(Ty);
1815  switch (Kind) {
1816  case Store:
1817  return DL.getTypeStoreSize(NcTy).getFixedValue();
1818  case Alloc:
1819  return DL.getTypeAllocSize(NcTy).getFixedValue();
1820  }
1821  llvm_unreachable("Unhandled SizeKind enum");
1822 }
1823 
1824 auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
1825  // The actual type may be shorter than the HVX vector, so determine
1826  // the alignment based on subtarget info.
1827  if (HST.isTypeForHVX(Ty))
1828  return HST.getVectorLength();
1829  return DL.getABITypeAlign(Ty).value();
1830 }
1831 
1832 auto HexagonVectorCombine::length(Value *Val) const -> size_t {
1833  return length(Val->getType());
1834 }
1835 
1836 auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
1837  auto *VecTy = dyn_cast<VectorType>(Ty);
1838  assert(VecTy && "Must be a vector type");
1839  return VecTy->getElementCount().getFixedValue();
1840 }
1841 
1842 auto HexagonVectorCombine::getNullValue(Type *Ty) const -> Constant * {
1843  assert(Ty->isIntOrIntVectorTy());
1844  auto Zero = ConstantInt::get(Ty->getScalarType(), 0);
1845  if (auto *VecTy = dyn_cast<VectorType>(Ty))
1846  return ConstantVector::getSplat(VecTy->getElementCount(), Zero);
1847  return Zero;
1848 }
1849 
1850 auto HexagonVectorCombine::getFullValue(Type *Ty) const -> Constant * {
1851  assert(Ty->isIntOrIntVectorTy());
1852  auto Minus1 = ConstantInt::get(Ty->getScalarType(), -1);
1853  if (auto *VecTy = dyn_cast<VectorType>(Ty))
1854  return ConstantVector::getSplat(VecTy->getElementCount(), Minus1);
1855  return Minus1;
1856 }
1857 
1858 auto HexagonVectorCombine::getConstSplat(Type *Ty, int Val) const
1859  -> Constant * {
1860  assert(Ty->isVectorTy());
1861  auto VecTy = cast<VectorType>(Ty);
1862  Type *ElemTy = VecTy->getElementType();
1863  // Add support for floats if needed.
1864  auto *Splat = ConstantVector::getSplat(VecTy->getElementCount(),
1865  ConstantInt::get(ElemTy, Val));
1866  return Splat;
1867 }
1868 
1869 auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
1870  if (auto *In = dyn_cast<Instruction>(V)) {
1871  SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
1872  return simplifyInstruction(In, Q);
1873  }
1874  return nullptr;
1875 }
1876 
1877 // Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
1878 auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
1879  Value *Src, int Start, int Length,
1880  int Where) const -> Value * {
1881  assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
1882  int SrcLen = getSizeOf(Src);
1883  int DstLen = getSizeOf(Dst);
1884  assert(0 <= Start && Start + Length <= SrcLen);
1885  assert(0 <= Where && Where + Length <= DstLen);
1886 
1887  int P2Len = PowerOf2Ceil(SrcLen | DstLen);
1888  auto *Undef = UndefValue::get(getByteTy());
1889  Value *P2Src = vresize(Builder, Src, P2Len, Undef);
1890  Value *P2Dst = vresize(Builder, Dst, P2Len, Undef);
1891 
1892  SmallVector<int, 256> SMask(P2Len);
1893  for (int i = 0; i != P2Len; ++i) {
1894  // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
1895  // Otherwise, pick Dst[i].
1896  SMask[i] =
1897  (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
1898  }
1899 
1900  Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask);
1901  return vresize(Builder, P2Insert, DstLen, Undef);
1902 }
1903 
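insertb expresses a byte-range insertion as a single shuffle: lanes inside [Where, Where+Length) come from Src (offset by Start - Where), and all other lanes keep Dst. An equivalent byte-array sketch (illustrative only, not part of this file):

#include <cstdint>
#include <vector>

// Sketch only: copy Length bytes of Src starting at Start into Dst at Where.
static std::vector<uint8_t> insertBytes(std::vector<uint8_t> Dst,
                                        const std::vector<uint8_t> &Src,
                                        int Start, int Length, int Where) {
  for (int i = 0; i != Length; ++i)
    Dst[Where + i] = Src[Start + i];
  return Dst;
}
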
1904 auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
1905  Value *Hi, Value *Amt) const -> Value * {
1906  assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
1907  if (isZero(Amt))
1908  return Hi;
1909  int VecLen = getSizeOf(Hi);
1910  if (auto IntAmt = getIntValue(Amt))
1911  return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),
1912  VecLen);
1913 
1914  if (HST.isTypeForHVX(Hi->getType())) {
1915  assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
1916  "Expecting an exact HVX type");
1917  return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_vlalignb),
1918  Hi->getType(), {Hi, Lo, Amt});
1919  }
1920 
1921  if (VecLen == 4) {
1922  Value *Pair = concat(Builder, {Lo, Hi});
1923  Value *Shift = Builder.CreateLShr(Builder.CreateShl(Pair, Amt), 32);
1924  Value *Trunc = Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()));
1925  return Builder.CreateBitCast(Trunc, Hi->getType());
1926  }
1927  if (VecLen == 8) {
1928  Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt);
1929  return vralignb(Builder, Lo, Hi, Sub);
1930  }
1931  llvm_unreachable("Unexpected vector length");
1932 }
1933 
1934 auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
1935  Value *Hi, Value *Amt) const -> Value * {
1936  assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
1937  if (isZero(Amt))
1938  return Lo;
1939  int VecLen = getSizeOf(Lo);
1940  if (auto IntAmt = getIntValue(Amt))
1941  return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);
1942 
1943  if (HST.isTypeForHVX(Lo->getType())) {
1944  assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
1945  "Expecting an exact HVX type");
1946  return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_valignb),
1947  Lo->getType(), {Hi, Lo, Amt});
1948  }
1949 
1950  if (VecLen == 4) {
1951  Value *Pair = concat(Builder, {Lo, Hi});
1952  Value *Shift = Builder.CreateLShr(Pair, Amt);
1953  Value *Trunc = Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()));
1954  return Builder.CreateBitCast(Trunc, Lo->getType());
1955  }
1956  if (VecLen == 8) {
1957  Type *Int64Ty = Type::getInt64Ty(F.getContext());
1958  Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty);
1959  Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty);
1960  Function *FI = Intrinsic::getDeclaration(F.getParent(),
1961  Intrinsic::hexagon_S2_valignrb);
1962  Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt});
1963  return Builder.CreateBitCast(Call, Lo->getType());
1964  }
1965  llvm_unreachable("Unexpected vector length");
1966 }
1967 
1968 // Concatenates a sequence of vectors of the same type.
1969 auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
1970  ArrayRef<Value *> Vecs) const -> Value * {
1971  assert(!Vecs.empty());
1972  SmallVector<int, 256> SMask;
1973  std::vector<Value *> Work[2];
1974  int ThisW = 0, OtherW = 1;
1975 
1976  Work[ThisW].assign(Vecs.begin(), Vecs.end());
1977  while (Work[ThisW].size() > 1) {
1978  auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());
1979  SMask.resize(length(Ty) * 2);
1980  std::iota(SMask.begin(), SMask.end(), 0);
1981 
1982  Work[OtherW].clear();
1983  if (Work[ThisW].size() % 2 != 0)
1984  Work[ThisW].push_back(UndefValue::get(Ty));
1985  for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
1986  Value *Joined = Builder.CreateShuffleVector(Work[ThisW][i],
1987  Work[ThisW][i + 1], SMask);
1988  Work[OtherW].push_back(Joined);
1989  }
1990  std::swap(ThisW, OtherW);
1991  }
1992 
1993  // Since there may have been some undefs appended to make shuffle operands
1994  // have the same type, perform the last shuffle to only pick the original
1995  // elements.
1996  SMask.resize(Vecs.size() * length(Vecs.front()->getType()));
1997  std::iota(SMask.begin(), SMask.end(), 0);
1998  Value *Total = Work[ThisW].front();
1999  return Builder.CreateShuffleVector(Total, SMask);
2000 }
2001 
2002 auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
2003  int NewSize, Value *Pad) const -> Value * {
2004  assert(isa<VectorType>(Val->getType()));
2005  auto *ValTy = cast<VectorType>(Val->getType());
2006  assert(ValTy->getElementType() == Pad->getType());
2007 
2008  int CurSize = length(ValTy);
2009  if (CurSize == NewSize)
2010  return Val;
2011  // Truncate?
2012  if (CurSize > NewSize)
2013  return getElementRange(Builder, Val, /*Ignored*/ Val, 0, NewSize);
2014  // Extend.
2015  SmallVector<int, 128> SMask(NewSize);
2016  std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
2017  std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
2018  Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad);
2019  return Builder.CreateShuffleVector(Val, PadVec, SMask);
2020 }
2021 
2022 auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
2023  Type *FromTy, Type *ToTy) const -> Value * {
2024  // Mask is a vector <N x i1>, where each element corresponds to an
2025  // element of FromTy. Remap it so that each element will correspond
2026  // to an element of ToTy.
2027  assert(isa<VectorType>(Mask->getType()));
2028 
2029  Type *FromSTy = FromTy->getScalarType();
2030  Type *ToSTy = ToTy->getScalarType();
2031  if (FromSTy == ToSTy)
2032  return Mask;
2033 
2034  int FromSize = getSizeOf(FromSTy);
2035  int ToSize = getSizeOf(ToSTy);
2036  assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
2037 
2038  auto *MaskTy = cast<VectorType>(Mask->getType());
2039  int FromCount = length(MaskTy);
2040  int ToCount = (FromCount * FromSize) / ToSize;
2041  assert((FromCount * FromSize) % ToSize == 0);
2042 
2043  auto *FromITy = getIntTy(FromSize * 8);
2044  auto *ToITy = getIntTy(ToSize * 8);
2045 
2046  // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
2047  // -> trunc to <M x i1>.
2048  Value *Ext = Builder.CreateSExt(
2049  Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false));
2050  Value *Cast = Builder.CreateBitCast(
2051  Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false));
2052  return Builder.CreateTrunc(
2053  Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false));
2054 }
2055 
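rescale only changes the granularity of a predicate: sign-extend each i1 to the source element width, bitcast to the destination element width, and truncate back to i1. When going from wider to narrower elements this replicates every mask bit. A scalar sketch of that direction (assuming FromSize is a multiple of ToSize; names illustrative):

#include <vector>

// Sketch only: each mask bit is repeated once per narrower element.
static std::vector<bool> rescaleMaskDown(const std::vector<bool> &Mask,
                                         int FromSize, int ToSize) {
  std::vector<bool> Out;
  for (bool B : Mask)
    Out.insert(Out.end(), FromSize / ToSize, B);
  return Out;
}
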
2056 // Bitcast to bytes, and return least significant bits.
2057 auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
2058  -> Value * {
2059  Type *ScalarTy = Val->getType()->getScalarType();
2060  if (ScalarTy == getBoolTy())
2061  return Val;
2062 
2063  Value *Bytes = vbytes(Builder, Val);
2064  if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
2065  return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)));
2066  // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
2067  // <1 x i1>.
2068  return Builder.CreateTrunc(Bytes, getBoolTy());
2069 }
2070 
2071 // Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
2072 auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
2073  -> Value * {
2074  Type *ScalarTy = Val->getType()->getScalarType();
2075  if (ScalarTy == getByteTy())
2076  return Val;
2077 
2078  if (ScalarTy != getBoolTy())
2079  return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)));
2080  // For bool, return a sext from i1 to i8.
2081  if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
2082  return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy));
2083  return Builder.CreateSExt(Val, getByteTy());
2084 }
2085 
2086 auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
2087  unsigned Start, unsigned Length) const
2088  -> Value * {
2089  assert(Start + Length <= length(Val));
2090  return getElementRange(Builder, Val, /*Ignored*/ Val, Start, Length);
2091 }
2092 
2093 auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
2094  -> Value * {
2095  size_t Len = length(Val);
2096  assert(Len % 2 == 0 && "Length should be even");
2097  return subvector(Builder, Val, 0, Len / 2);
2098 }
2099 
2100 auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
2101  -> Value * {
2102  size_t Len = length(Val);
2103  assert(Len % 2 == 0 && "Length should be even");
2104  return subvector(Builder, Val, Len / 2, Len / 2);
2105 }
2106 
2107 auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
2108  Value *Val1) const -> Value * {
2109  assert(Val0->getType() == Val1->getType());
2110  int Len = length(Val0);
2111  SmallVector<int, 128> Mask(2 * Len);
2112 
2113  for (int i = 0; i != Len; ++i) {
2114  Mask[i] = 2 * i; // Even
2115  Mask[i + Len] = 2 * i + 1; // Odd
2116  }
2117  return Builder.CreateShuffleVector(Val0, Val1, Mask);
2118 }
2119 
2120 auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
2121  Value *Val1) const -> Value * { //
2122  assert(Val0->getType() == Val1->getType());
2123  int Len = length(Val0);
2124  SmallVector<int, 128> Mask(2 * Len);
2125 
2126  for (int i = 0; i != Len; ++i) {
2127  Mask[2 * i + 0] = i; // Val0
2128  Mask[2 * i + 1] = i + Len; // Val1
2129  }
2130  return Builder.CreateShuffleVector(Val0, Val1, Mask);
2131 }
2132 
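vdeal deinterleaves (all even lanes of Val0:Val1 first, then all odd lanes) while vshuff interleaves its two inputs lane by lane; the two are inverse permutations of each other. A small host-side illustration of the masks built above for Len = 4 (standalone example, not part of the pass):

#include <cstdio>

int main() {
  const int Len = 4;
  int Deal[2 * Len], Shuff[2 * Len];
  for (int i = 0; i != Len; ++i) {
    Deal[i] = 2 * i;           // vdeal: 0 2 4 6 ...
    Deal[i + Len] = 2 * i + 1; //        ... then 1 3 5 7
    Shuff[2 * i + 0] = i;      // vshuff: 0 4 1 5 2 6 3 7
    Shuff[2 * i + 1] = i + Len;
  }
  for (int i = 0; i != 2 * Len; ++i)
    std::printf("%d %d\n", Deal[i], Shuff[i]);
  return 0;
}
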
2133 auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
2134  Intrinsic::ID IntID, Type *RetTy,
2135  ArrayRef<Value *> Args,
2136  ArrayRef<Type *> ArgTys) const
2137  -> Value * {
2138  auto getCast = [&](IRBuilderBase &Builder, Value *Val,
2139  Type *DestTy) -> Value * {
2140  Type *SrcTy = Val->getType();
2141  if (SrcTy == DestTy)
2142  return Val;
2143 
2144  // Non-HVX type. It should be a scalar, and it should already have
2145  // a valid type.
2146  assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
2147 
2148  Type *BoolTy = Type::getInt1Ty(F.getContext());
2149  if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)
2150  return Builder.CreateBitCast(Val, DestTy);
2151 
2152  // Predicate HVX vector.
2153  unsigned HwLen = HST.getVectorLength();
2154  Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
2155  : Intrinsic::hexagon_V6_pred_typecast_128B;
2156  Function *FI =
2157  Intrinsic::getDeclaration(F.getParent(), TC, {DestTy, Val->getType()});
2158  return Builder.CreateCall(FI, {Val});
2159  };
2160 
2161  Function *IntrFn = Intrinsic::getDeclaration(F.getParent(), IntID, ArgTys);
2162  FunctionType *IntrTy = IntrFn->getFunctionType();
2163 
2164  SmallVector<Value *, 4> IntrArgs;
2165  for (int i = 0, e = Args.size(); i != e; ++i) {
2166  Value *A = Args[i];
2167  Type *T = IntrTy->getParamType(i);
2168  if (A->getType() != T) {
2169  IntrArgs.push_back(getCast(Builder, A, T));
2170  } else {
2171  IntrArgs.push_back(A);
2172  }
2173  }
2174  Value *Call = Builder.CreateCall(IntrFn, IntrArgs);
2175 
2176  Type *CallTy = Call->getType();
2177  if (RetTy == nullptr || CallTy == RetTy)
2178  return Call;
2179  // Scalar types should have RetTy matching the call return type.
2180  assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
2181  return getCast(Builder, Call, RetTy);
2182 }
2183 
2184 auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
2185  Value *Vec,
2186  unsigned ToWidth) const
2187  -> SmallVector<Value *> {
2188  // Break a vector of wide elements into a series of vectors with narrow
2189  // elements:
2190  // (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
2191  // -->
2192  // (a0, a1, a2, ...) // lowest "ToWidth" bits
2193  // (b0, b1, b2, ...) // the next lowest...
2194  // (c0, c1, c2, ...) // ...
2195  // ...
2196  //
2197  // The number of elements in each resulting vector is the same as
2198  // in the original vector.
2199 
2200  auto *VecTy = cast<VectorType>(Vec->getType());
2201  assert(VecTy->getElementType()->isIntegerTy());
2202  unsigned FromWidth = VecTy->getScalarSizeInBits();
2203  assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
2204  assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
2205  unsigned NumResults = FromWidth / ToWidth;
2206 
2207  SmallVector<Value *> Results(NumResults);
2208  Results[0] = Vec;
2209  unsigned Length = length(VecTy);
2210 
2211  // Do it by splitting in half, since those operations correspond to deal
2212  // instructions.
2213  auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
2214  // Take V = Results[Begin], split it in L, H.
2215  // Store Results[Begin] = L, Results[(Begin+End)/2] = H
2216  // Call itself recursively split(Begin, Half), split(Half+1, End)
2217  if (Begin + 1 == End)
2218  return;
2219 
2220  Value *Val = Results[Begin];
2221  unsigned Width = Val->getType()->getScalarSizeInBits();
2222 
2223  auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);
2224  Value *VVal = Builder.CreateBitCast(Val, VTy);
2225 
2226  Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));
2227 
2228  unsigned Half = (Begin + End) / 2;
2229  Results[Begin] = sublo(Builder, Res);
2230  Results[Half] = subhi(Builder, Res);
2231 
2232  splitFunc(Begin, Half, splitFunc);
2233  splitFunc(Half, End, splitFunc);
2234  };
2235 
2236  splitInHalf(0, NumResults, splitInHalf);
2237  return Results;
2238 }
2239 
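splitVectorElements produces one vector per 32-bit slice of the original elements while keeping the element count unchanged; the recursive vdeal halving is just an efficient way of getting there. A scalar sketch of the net effect for 64-bit elements split to ToWidth = 32 (names illustrative):

#include <cstdint>
#include <utility>
#include <vector>

// Sketch only: low and high 32-bit slices of each 64-bit element.
static std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
splitTo32(const std::vector<uint64_t> &Vec) {
  std::vector<uint32_t> Lo, Hi;
  for (uint64_t V : Vec) {
    Lo.push_back(static_cast<uint32_t>(V));       // lowest 32 bits
    Hi.push_back(static_cast<uint32_t>(V >> 32)); // next 32 bits
  }
  return {Lo, Hi};
}
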
2240 auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
2241  ArrayRef<Value *> Values,
2242  VectorType *ToType) const
2243  -> Value * {
2244  assert(ToType->getElementType()->isIntegerTy());
2245 
2246  // If the list of values does not have power-of-2 elements, append copies
2247  // of the sign bit to it, to make the size be 2^n.
2248  // The reason for this is that the values will be joined in pairs, because
2249  // otherwise the shuffles will result in convoluted code. With pairwise
2250  // joins, the shuffles will hopefully be folded into a perfect shuffle.
2251  // The output will need to be sign-extended to a type with element width
2252  // being a power-of-2 anyways.
2253  SmallVector<Value *> Inputs(Values.begin(), Values.end());
2254 
2255  unsigned ToWidth = ToType->getScalarSizeInBits();
2256  unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
2257  assert(Width <= ToWidth);
2258  assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
2259  unsigned Length = length(Inputs.front()->getType());
2260 
2261  unsigned NeedInputs = ToWidth / Width;
2262  if (Inputs.size() != NeedInputs) {
2263  Value *Last = Inputs.back();
2264  Value *Sign =
2265  Builder.CreateAShr(Last, getConstSplat(Last->getType(), Width - 1));
2266  Inputs.resize(NeedInputs, Sign);
2267  }
2268 
2269  while (Inputs.size() > 1) {
2270  Width *= 2;
2271  auto *VTy = VectorType::get(getIntTy(Width), Length, false);
2272  for (int i = 0, e = Inputs.size(); i < e; i += 2) {
2273  Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);
2274  Inputs[i / 2] = Builder.CreateBitCast(Res, VTy);
2275  }
2276  Inputs.resize(Inputs.size() / 2);
2277  }
2278 
2279  assert(Inputs.front()->getType() == ToType);
2280  return Inputs.front();
2281 }
2282 
2283 auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
2284  Value *Ptr1) const
2285  -> std::optional<int> {
2286  struct Builder : IRBuilder<> {
2287  Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
2288  ~Builder() {
2289  for (Instruction *I : llvm::reverse(ToErase))
2290  I->eraseFromParent();
2291  }
2292  SmallVector<Instruction *, 8> ToErase;
2293  };
2294 
2295 #define CallBuilder(B, F) \
2296  [&](auto &B_) { \
2297  Value *V = B_.F; \
2298  if (auto *I = dyn_cast<Instruction>(V)) \
2299  B_.ToErase.push_back(I); \
2300  return V; \
2301  }(B)
2302 
2303  auto Simplify = [this](Value *V) {
2304  if (Value *S = simplify(V))
2305  return S;
2306  return V;
2307  };
2308 
2309  auto StripBitCast = [](Value *V) {
2310  while (auto *C = dyn_cast<BitCastInst>(V))
2311  V = C->getOperand(0);
2312  return V;
2313  };
2314 
2315  Ptr0 = StripBitCast(Ptr0);
2316  Ptr1 = StripBitCast(Ptr1);
2317  if (!isa<GetElementPtrInst>(Ptr0) || !isa<GetElementPtrInst>(Ptr1))
2318  return std::nullopt;
2319 
2320  auto *Gep0 = cast<GetElementPtrInst>(Ptr0);
2321  auto *Gep1 = cast<GetElementPtrInst>(Ptr1);
2322  if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
2323  return std::nullopt;
2324  if (Gep0->getSourceElementType() != Gep1->getSourceElementType())
2325  return std::nullopt;
2326 
2327  Builder B(Gep0->getParent());
2328  int Scale = getSizeOf(Gep0->getSourceElementType(), Alloc);
2329 
2330  // FIXME: for now only check GEPs with a single index.
2331  if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
2332  return std::nullopt;
2333 
2334  Value *Idx0 = Gep0->getOperand(1);
2335  Value *Idx1 = Gep1->getOperand(1);
2336 
2337  // First, try to simplify the subtraction directly.
2338  if (auto *Diff = dyn_cast<ConstantInt>(
2339  Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
2340  return Diff->getSExtValue() * Scale;
2341 
2342  KnownBits Known0 = getKnownBits(Idx0, Gep0);
2343  KnownBits Known1 = getKnownBits(Idx1, Gep1);
2344  APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
2345  if (Unknown.isAllOnes())
2346  return std::nullopt;
2347 
2348  Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
2349  Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
2350  Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
2351  Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
2352  int Diff0 = 0;
2353  if (auto *C = dyn_cast<ConstantInt>(SubU)) {
2354  Diff0 = C->getSExtValue();
2355  } else {
2356  return std::nullopt;
2357  }
2358 
2359  Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);
2360  Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
2361  Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
2362  Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
2363  int Diff1 = 0;
2364  if (auto *C = dyn_cast<ConstantInt>(SubK)) {
2365  Diff1 = C->getSExtValue();
2366  } else {
2367  return std::nullopt;
2368  }
2369 
2370  return (Diff0 + Diff1) * Scale;
2371 
2372 #undef CallBuilder
2373 }
2374 
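When the direct index subtraction does not fold, the code splits each index into its unknown-bit and known-bit parts, subtracts the parts separately, and succeeds only if both partial differences simplify to constants; their sum is the full difference because masking with U and ~U partitions the bits. A scalar illustration of the recombination (U is the unknown-bit mask; names illustrative):

#include <cstdint>

// Sketch only: (A & U) + (A & ~U) == A, so the two partial differences add
// up to Idx0 - Idx1 (modulo 2^64).
static int64_t maskedDifference(int64_t Idx0, int64_t Idx1, uint64_t U) {
  uint64_t A = static_cast<uint64_t>(Idx0), B = static_cast<uint64_t>(Idx1);
  uint64_t DiffUnknown = (A & U) - (B & U);
  uint64_t DiffKnown = (A & ~U) - (B & ~U);
  return static_cast<int64_t>(DiffUnknown + DiffKnown);
}
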
2375 auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
2376  const Instruction *CtxI) const
2377  -> unsigned {
2378  return ComputeMaxSignificantBits(V, DL, /*Depth=*/0, &AC, CtxI, &DT);
2379 }
2380 
2381 auto HexagonVectorCombine::getKnownBits(const Value *V,
2382  const Instruction *CtxI) const
2383  -> KnownBits {
2384  return computeKnownBits(V, DL, /*Depth=*/0, &AC, CtxI, &DT, /*ORE=*/nullptr,
2385  /*UseInstrInfo=*/true);
2386 }
2387 
2388 template <typename T>
2389 auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
2391  const T &IgnoreInsts) const
2392  -> bool {
2393  auto getLocOrNone =
2394  [this](const Instruction &I) -> std::optional<MemoryLocation> {
2395  if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
2396  switch (II->getIntrinsicID()) {
2397  case Intrinsic::masked_load:
2398  return MemoryLocation::getForArgument(II, 0, TLI);
2399  case Intrinsic::masked_store:
2400  return MemoryLocation::getForArgument(II, 1, TLI);
2401  }
2402  }
2403  return MemoryLocation::getOrNone(&I);
2404  };
2405 
2406  // The source and the destination must be in the same basic block.
2407  const BasicBlock &Block = *In.getParent();
2408  assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
2409  // No PHIs.
2410  if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))
2411  return false;
2412 
2413  if (!mayHaveNonDefUseDependency(In))
2414  return true;
2415  bool MayWrite = In.mayWriteToMemory();
2416  auto MaybeLoc = getLocOrNone(In);
2417 
2418  auto From = In.getIterator();
2419  if (From == To)
2420  return true;
2421  bool MoveUp = (To != Block.end() && To->comesBefore(&In));
2422  auto Range =
2423  MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
2424  for (auto It = Range.first; It != Range.second; ++It) {
2425  const Instruction &I = *It;
2426  if (llvm::is_contained(IgnoreInsts, &I))
2427  continue;
2428  // assume intrinsic can be ignored
2429  if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
2430  if (II->getIntrinsicID() == Intrinsic::assume)
2431  continue;
2432  }
2433  // Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
2434  if (I.mayThrow())
2435  return false;
2436  if (auto *CB = dyn_cast<CallBase>(&I)) {
2437  if (!CB->hasFnAttr(Attribute::WillReturn))
2438  return false;
2439  if (!CB->hasFnAttr(Attribute::NoSync))
2440  return false;
2441  }
2442  if (I.mayReadOrWriteMemory()) {
2443  auto MaybeLocI = getLocOrNone(I);
2444  if (MayWrite || I.mayWriteToMemory()) {
2445  if (!MaybeLoc || !MaybeLocI)
2446  return false;
2447  if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
2448  return false;
2449  }
2450  }
2451  }
2452  return true;
2453 }
2454 
2455 auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
2456  if (auto *VecTy = dyn_cast<VectorType>(Ty))
2457  return VecTy->getElementType() == getByteTy();
2458  return false;
2459 }
2460 
2461 auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
2462  Value *Hi, int Start,
2463  int Length) const -> Value * {
2464  assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
2465  SmallVector<int, 128> SMask(Length);
2466  std::iota(SMask.begin(), SMask.end(), Start);
2467  return Builder.CreateShuffleVector(Lo, Hi, SMask);
2468 }
2469 
2470 // Pass management.
2471 
2472 namespace llvm {
2473 void initializeHexagonVectorCombineLegacyPass(PassRegistry &);
2474 FunctionPass *createHexagonVectorCombineLegacyPass();
2475 } // namespace llvm
2476 
2477 namespace {
2478 class HexagonVectorCombineLegacy : public FunctionPass {
2479 public:
2480  static char ID;
2481 
2482  HexagonVectorCombineLegacy() : FunctionPass(ID) {}
2483 
2484  StringRef getPassName() const override { return "Hexagon Vector Combine"; }
2485 
2486  void getAnalysisUsage(AnalysisUsage &AU) const override {
2487  AU.setPreservesCFG();
2488  AU.addRequired<AAResultsWrapperPass>();
2489  AU.addRequired<AssumptionCacheTracker>();
2490  AU.addRequired<DominatorTreeWrapperPass>();
2491  AU.addRequired<TargetLibraryInfoWrapperPass>();
2492  AU.addRequired<TargetPassConfig>();
2493  FunctionPass::getAnalysisUsage(AU);
2494  }
2495 
2496  bool runOnFunction(Function &F) override {
2497  if (skipFunction(F))
2498  return false;
2499  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
2500  AssumptionCache &AC =
2501  getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2502  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2503  TargetLibraryInfo &TLI =
2504  getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2505  auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
2506  HexagonVectorCombine HVC(F, AA, AC, DT, TLI, TM);
2507  return HVC.run();
2508  }
2509 };
2510 } // namespace
2511 
2512 char HexagonVectorCombineLegacy::ID = 0;
2513 
2514 INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
2515  "Hexagon Vector Combine", false, false)
2516 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
2517 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2518 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
2519 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2520 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
2521 INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
2522  "Hexagon Vector Combine", false, false)
2523 
2524 FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
2525  return new HexagonVectorCombineLegacy();
2526 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:77
i
i
Definition: README.txt:29
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:156
Int32Ty
IntegerType * Int32Ty
Definition: NVVMIntrRange.cpp:67
ValueTypes.h
llvm::RecursivelyDeleteTriviallyDeadInstructions
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:519
AssumptionCache.h
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:308
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE, "Hexagon Vector Combine", false, false) INITIALIZE_PASS_END(HexagonVectorCombineLegacy
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4715
MathExtras.h
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::sys::path::const_iterator::end
friend const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
llvm::Type::getInt1Ty
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:236
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1748
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::mayHaveNonDefUseDependency
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
Definition: ValueTracking.cpp:4840
llvm::AArch64PACKey::ID
ID
Definition: AArch64BaseInfo.h:818
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1481
Metadata.h
llvm::Type::getInt8PtrTy
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:291
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
shuffles::vdeal
MaskT vdeal(ArrayRef< int > Vu, ArrayRef< int > Vv, unsigned Size, bool TakeOdd)
Definition: HexagonISelDAGToDAGHVX.cpp:884
IntrinsicInst.h
llvm::SimplifyQuery
Definition: InstructionSimplify.h:93
simplify
hexagon bit simplify
Definition: HexagonBitSimplify.cpp:289
llvm::Function::empty
bool empty() const
Definition: Function.h:713
T
llvm::Function
Definition: Function.h:60
llvm::PseudoProbeReservedId::Last
@ Last
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
Pass.h
llvm::PatternMatch::m_LShr
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1117
llvm::MemoryLocation::getOrNone
static std::optional< MemoryLocation > getOrNone(const Instruction *Inst)
Definition: MemoryLocation.cpp:78
llvm::KnownBits::Zero
APInt Zero
Definition: KnownBits.h:24
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:328
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1199
llvm::RISCVFenceField::W
@ W
Definition: RISCVBaseInfo.h:266
llvm::createHexagonVectorCombineLegacyPass
FunctionPass * createHexagonVectorCombineLegacyPass()
Definition: HexagonVectorCombine.cpp:2524
llvm::ARM_MB::SY
@ SY
Definition: ARMBaseInfo.h:74
llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:979
HexagonSubtarget.h
llvm::MipsISD::Lo
@ Lo
Definition: MipsISelLowering.h:79
llvm::HexagonTargetMachine
Definition: HexagonTargetMachine.h:25
llvm::PatternMatch::m_CombineOr
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:218
llvm::initializeHexagonVectorCombineLegacyPass
void initializeHexagonVectorCombineLegacyPass(PassRegistry &)
llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2525
llvm::erase_if
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:1997
ValueTracking.h
Local.h
llvm::ComputeMaxSignificantBits
unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr)
Get the upper bound on bit size for this Value Op as a signed integer.
Definition: ValueTracking.cpp:393
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
APInt.h
Shift
bool Shift
Definition: README.txt:468
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
DenseMap.h
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
Vector
So we should use XX3Form_Rcr to implement intrinsic Convert DP outs ins xscvdpsp No builtin are required Round &Convert QP DP(dword[1] is set to zero) No builtin are required Round to Quad Precision because you need to assign rounding mode in instruction Provide builtin(set f128:$vT,(int_ppc_vsx_xsrqpi f128:$vB))(set f128 yields< n x< ty > >< result > yields< ty >< result > No builtin are required Load Store Vector
Definition: README_P9.txt:497
llvm::PatternMatch::m_AShr
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1123
Results
Function Alias Analysis Results
Definition: AliasAnalysis.cpp:772
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:119
STLExtras.h
llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:630
llvm::propagateMetadata
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
Definition: VectorUtils.cpp:877
HexagonTargetMachine.h
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:458
getConstInt
static ConstantInt * getConstInt(MDNode *MD, unsigned NumOp)
Definition: SPIRVCallLowering.cpp:68
LLVM_ATTRIBUTE_UNUSED
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:172
llvm::Type::getNonOpaquePointerElementType
Type * getNonOpaquePointerElementType() const
Only use this method in code that is not reachable with opaque pointers, or part of deprecated method...
Definition: Type.h:401
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:237
llvm::Data
@ Data
Definition: SIMachineScheduler.h:55
p
the resulting code requires compare and branches when and if * p
Definition: README.txt:396
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:265
KnownBits.h
Uses
SmallPtrSet< MachineInstr *, 2 > Uses
Definition: ARMLowOverheadLoops.cpp:590
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::MipsISD::Hi
@ Hi
Definition: MipsISelLowering.h:75
isUndef
static bool isUndef(ArrayRef< int > Mask)
Definition: HexagonISelDAGToDAGHVX.cpp:1032
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
AliasAnalysis.h
llvm::ARMBuildAttrs::Section
@ Section
Legacy Tags.
Definition: ARMBuildAttributes.h:82
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::PatternMatch::m_APInt
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:278
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1734
llvm::RecurKind::And
@ And
Bitwise or logical AND of integers.
TargetMachine.h
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
isZero
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:524
llvm::AAResults
Definition: AliasAnalysis.h:294
f
Itanium Name Demangler i e convert the string _Z1fv into f()". You can also use the CRTP base ManglingParser to perform some simple analysis on the mangled name
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::User
Definition: User.h:44
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
llvm::KnownBits::One
APInt One
Definition: KnownBits.h:25
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
llvm::DomTreeNodeBase::children
iterator_range< iterator > children()
Definition: GenericDomTree.h:83
TargetLibraryInfo.h
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:246
false
Definition: StackSlotColoring.cpp:141
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:547
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
llvm::Instruction::getOpcodeName
const char * getOpcodeName() const
Definition: Instruction.h:171
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::APInt::isAllOnes
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:347
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:306
llvm::PassRegistry
PassRegistry - This class manages the registration and intitialization of the pass subsystem as appli...
Definition: PassRegistry.h:38
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1486
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
llvm::ConstantVector::getSplat
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1391
llvm::SmallVectorImpl::resize
void resize(size_type N)
Definition: SmallVector.h:642
llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:291
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1713
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:879
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Align
uint64_t Align
Definition: ELFObjHandler.cpp:82
PatternMatch.h
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:155
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::omp::RTLDependInfoFields::Len
@ Len
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::dxil::PointerTypeAnalysis::run
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
Definition: PointerTypeAnalysis.cpp:189
llvm::APInt::ashr
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:808
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:210
llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84
llvm::PatternMatch::m_Shr
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
Definition: PatternMatch.h:1303
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
G
const DataFlowGraph & G
Definition: RDFGraph.cpp:200
llvm::tgtok::In
@ In
Definition: TGLexer.h:51
Combine
Hexagon Vector Combine
Definition: HexagonVectorCombine.cpp:2522
VectorUtils.h
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:298
llvm::MipsISD::Ext
@ Ext
Definition: MipsISelLowering.h:159
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::cl::values
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:705
llvm::RegState::Undef
@ Undef
Value of the register doesn't matter.
Definition: MachineInstrBuilder.h:52
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:340
llvm::AMDGPU::Hwreg::Offset
Offset
Definition: SIDefines.h:416
Index
uint32_t Index
Definition: ELFObjHandler.cpp:83
llvm::DomTreeNodeBase::getBlock
NodeT * getBlock() const
Definition: GenericDomTree.h:88
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:475
llvm::pdb::Unknown
@ Unknown
Definition: PDBTypes.h:396
D
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
InstSimplifyFolder.h
const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232
Addr
uint64_t Addr
Definition: ELFObjHandler.cpp:79
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
move
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional move
Definition: README.txt:546
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:53
llvm::DenseMap
Definition: DenseMap.h:714
llvm::EVT::getEVT
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:595
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::FunctionType::getParamType
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1868
llvm::simplifyInstruction
Value * simplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE=nullptr)
See if we can compute a simplified version of this instruction.
Definition: InstructionSimplify.cpp:6599
ArrayRef.h
llvm::concat
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&... Ranges)
Concatenated range across two or more ranges.
Definition: STLExtras.h:1208
TargetPassConfig.h
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:197
llvm::sys::path::const_iterator::begin
friend const_iterator begin(StringRef path, Style style)
Get begin iterator over path.
Definition: Path.cpp:226
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
llvm::move
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1861
llvm::MemoryLocation::getForArgument
static MemoryLocation getForArgument(const CallBase *Call, unsigned ArgIdx, const TargetLibraryInfo *TLI)
Return a location representing a particular argument of a call.
Definition: MemoryLocation.cpp:160
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
Ptr
@ Ptr
Definition: TargetLibraryInfo.cpp:60
llvm::AMDGPU::IsaInfo::TargetIDSetting::Off
@ Off
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
getType
static M68kRelType getType(unsigned Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel)
Definition: M68kELFObjectWriter.cpp:48
llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1715
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: APInt.h:32
None.h
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
Mul
BinaryOperator * Mul
Definition: X86PartialReduction.cpp:70
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1741
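For example (helper name invented for illustration):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// True if any instruction in the range may write to memory.
static bool anyMayWrite(ArrayRef<Instruction *> Insts) {
  return any_of(Insts, [](Instruction *I) { return I->mayWriteToMemory(); });
}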
llvm::logicalview::LVAttributeKind::Range
@ Range
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass iff it does not (1) add or remove basic blocks from the function or (2) modify terminator instructions in any way.
Definition: Pass.cpp:265
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
llvm::InstSimplifyFolder
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
Definition: InstSimplifyFolder.h:34
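A sketch of pairing this folder with an IRBuilder so trivially foldable instructions are never emitted; the helper and its operands are illustrative, not this file's code:

#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Emit (A + B) & Mask in front of In; the folder simplifies on the fly,
// so e.g. adding a constant zero produces no new instruction.
static Value *addThenMask(Instruction *In, Value *A, Value *B, uint64_t Mask) {
  const DataLayout &DL = In->getModule()->getDataLayout();
  IRBuilder<InstSimplifyFolder> Builder(In->getParent(), In->getIterator(),
                                        InstSimplifyFolder(DL));
  Value *Sum = Builder.CreateAdd(A, B);
  return Builder.CreateAnd(Sum, Mask);
}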
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::CmpInst::ICMP_ULT
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:745
llvm::logicalview::LVAttributeKind::Zero
@ Zero
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::ms_demangle::IntrinsicFunctionKind::New
@ New
uint32_t
llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:2013
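For instance (types and names chosen only for the example):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Concatenate two index lists; append_range(Out, R) is shorthand for
// inserting R's elements at the end of Out.
static SmallVector<int, 8> joined(ArrayRef<int> A, ArrayRef<int> B) {
  SmallVector<int, 8> Out;
  append_range(Out, A);
  append_range(Out, B);
  return Out;
}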
llvm::IRBuilderBase
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:93
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
DEBUG_TYPE
#define DEBUG_TYPE
Definition: HexagonVectorCombine.cpp:55
Unsigned
@ Unsigned
Definition: NVPTXISelLowering.cpp:4716
llvm::DomTreeNodeBase< BasicBlock >
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
getIntTy
static IntegerType * getIntTy(IRBuilderBase &B, const TargetLibraryInfo *TLI)
Definition: BuildLibCalls.cpp:1419
CallBuilder
#define CallBuilder(B, F)
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:85
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:240
llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:827
std
Definition: BitVector.h:851
llvm::KnownBits
Definition: KnownBits.h:23
llvm::copy_if
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1780
llvm::None
constexpr std::nullopt_t None
Definition: None.h:27
llvm::SmallVectorImpl::assign
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:708
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:348
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:216
llvm::Function::back
const BasicBlock & back() const
Definition: Function.h:716
llvm::ConstantInt::getSigned
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.cpp:893
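A brief sketch; the helper name is invented:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// -1 sign-extended to the requested width, e.g. 0xFFFFFFFF for Width == 32.
static ConstantInt *allOnes(LLVMContext &Ctx, unsigned Width) {
  IntegerType *Ty = IntegerType::get(Ctx, Width);
  return ConstantInt::getSigned(Ty, -1);
}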
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:774
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:226
llvm::Function::getFunctionType
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:175
llvm::AMDGPU::Hwreg::Width
Width
Definition: SIDefines.h:433
shuffles::vshuff
MaskT vshuff(ArrayRef< int > Vu, ArrayRef< int > Vv, unsigned Size, bool TakeOdd)
Definition: HexagonISelDAGToDAGHVX.cpp:871
llvm::pdb::DbgHeaderType::Max
@ Max
llvm::TargetMachine::getSubtargetImpl
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInfo-derived member variable.
Definition: TargetMachine.h:132
SmallVector.h
Dominators.h
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:924
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
InstructionSimplify.h
llvm::HexagonSubtarget
Definition: HexagonSubtarget.h:43
llvm::codeview::CompileSym3Flags::Exp
@ Exp
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Definition: BasicBlock.h:119
llvm::reverse
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:485
llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do their job.
Definition: Pass.cpp:98
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
getLocation
static MemoryLocation getLocation(Instruction *I)
Definition: SLPVectorizer.cpp:778
llvm::IntegerType::get
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
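Types are uniqued per LLVMContext, so repeated queries return the same object; a small demonstration (assertion added for illustration):

#include <cassert>
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static IntegerType *getI32(LLVMContext &Ctx) {
  IntegerType *A = IntegerType::get(Ctx, 32);
  assert(A == IntegerType::get(Ctx, 32) &&
         "same width in the same context yields the same Type*");
  return A;
}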
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
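A skeletal legacy-PM pass showing how addRequired and setPreservesCFG are typically combined in getAnalysisUsage; the class is a placeholder, not this file's pass:

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
using namespace llvm;

namespace {
struct UsageSketch : FunctionPass {
  static char ID;
  UsageSketch() : FunctionPass(ID) {}
  bool runOnFunction(Function &) override { return false; }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>(); // analyses we consume
    AU.addRequired<AAResultsWrapperPass>();
    AU.setPreservesCFG();                       // we only rewrite instructions
  }
};
char UsageSketch::ID = 0;
} // end anonymous namespace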
From
BlockVerifier::State From
Definition: BlockVerifier.cpp:55
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
raw_ostream.h
llvm::pdb::PDB_SymType::Block
@ Block
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::VectorType::get
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:668
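For example, building a fixed 32 x i32 vector type (roughly one 128-byte HVX register viewed as words; the numbers are illustrative, not queried from the subtarget):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static VectorType *makeWordVecTy(LLVMContext &Ctx) {
  Type *Int32Ty = Type::getInt32Ty(Ctx);
  return VectorType::get(Int32Ty, ElementCount::getFixed(32));
}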
llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1045
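A minimal pattern-matching sketch (helper invented for illustration):

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// If V is an integer multiply, capture its operands in X and Y.
static bool matchMul(Value *V, Value *&X, Value *&Y) {
  return match(V, m_Mul(m_Value(X), m_Value(Y)));
}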
llvm::BasicBlock::const_iterator
InstListType::const_iterator const_iterator
Definition: BasicBlock.h:88
llvm::sampleprof::Base
@ Base
Definition: Discriminator.h:58
llvm::FunctionType
Class to represent function types.
Definition: DerivedTypes.h:103
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:288
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:39
llvm::AArch64::Rounding
Rounding
Possible values of current rounding mode, which is specified in bits 23:22 of FPCR.
Definition: AArch64ISelLowering.h:484