LLVM 22.0.0git
HexagonVectorCombine.cpp
Go to the documentation of this file.
1//===-- HexagonVectorCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// HexagonVectorCombine is a utility class implementing a variety of functions
9// that assist in vector-based optimizations.
10//
11// AlignVectors: replace unaligned vector loads and stores with aligned ones.
12// HvxIdioms: recognize various opportunities to generate HVX intrinsic code.
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/DenseMap.h"
18#include "llvm/ADT/STLExtras.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/IRBuilder.h"
33#include "llvm/IR/Intrinsics.h"
34#include "llvm/IR/IntrinsicsHexagon.h"
35#include "llvm/IR/Metadata.h"
38#include "llvm/Pass.h"
45
46#include "Hexagon.h"
47#include "HexagonSubtarget.h"
49
50#include <algorithm>
51#include <deque>
52#include <map>
53#include <optional>
54#include <set>
55#include <utility>
56#include <vector>
57
58#define DEBUG_TYPE "hexagon-vc"
59
60// This is a const that represents default HVX VTCM page size.
61// It is boot time configurable, so we probably want an API to
62// read it, but for now assume 128KB
63#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
64
65using namespace llvm;
66
67namespace {
68cl::opt<bool> DumpModule("hvc-dump-module", cl::Hidden);
69cl::opt<bool> VAEnabled("hvc-va", cl::Hidden, cl::init(true)); // Align
70cl::opt<bool> VIEnabled("hvc-vi", cl::Hidden, cl::init(true)); // Idioms
71cl::opt<bool> VADoFullStores("hvc-va-full-stores", cl::Hidden);
72
73cl::opt<unsigned> VAGroupCountLimit("hvc-va-group-count-limit", cl::Hidden,
74 cl::init(~0));
75cl::opt<unsigned> VAGroupSizeLimit("hvc-va-group-size-limit", cl::Hidden,
76 cl::init(~0));
77
78class HexagonVectorCombine {
79public:
80 HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
82 TargetLibraryInfo &TLI_, const TargetMachine &TM_)
83 : F(F_), DL(F.getDataLayout()), AA(AA_), AC(AC_), DT(DT_),
84 SE(SE_), TLI(TLI_),
85 HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}
86
87 bool run();
88
89 // Common integer type.
90 IntegerType *getIntTy(unsigned Width = 32) const;
91 // Byte type: either scalar (when Length = 0), or vector with given
92 // element count.
93 Type *getByteTy(int ElemCount = 0) const;
94 // Boolean type: either scalar (when Length = 0), or vector with given
95 // element count.
96 Type *getBoolTy(int ElemCount = 0) const;
97 // Create a ConstantInt of type returned by getIntTy with the value Val.
98 ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
99 // Get the integer value of V, if it exists.
100 std::optional<APInt> getIntValue(const Value *Val) const;
101 // Is Val a constant 0, or a vector of 0s?
102 bool isZero(const Value *Val) const;
103 // Is Val an undef value?
104 bool isUndef(const Value *Val) const;
105 // Is Val a scalar (i1 true) or a vector of (i1 true)?
106 bool isTrue(const Value *Val) const;
107 // Is Val a scalar (i1 false) or a vector of (i1 false)?
108 bool isFalse(const Value *Val) const;
109
110 // Get HVX vector type with the given element type.
111 VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;
112
113 enum SizeKind {
114 Store, // Store size
115 Alloc, // Alloc size
116 };
117 int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
118 int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
119 int getTypeAlignment(Type *Ty) const;
120 size_t length(Value *Val) const;
121 size_t length(Type *Ty) const;
122
123 Value *simplify(Value *Val) const;
124
125 Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
126 int Length, int Where) const;
127 Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
128 Value *Amt) const;
129 Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
130 Value *Amt) const;
131 Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
132 Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
133 Value *Pad) const;
134 Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
135 Type *ToTy) const;
136 Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
137 Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
138 Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
139 unsigned Length) const;
140 Value *sublo(IRBuilderBase &Builder, Value *Val) const;
141 Value *subhi(IRBuilderBase &Builder, Value *Val) const;
142 Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
143 Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
144
145 Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
146 Type *RetTy, ArrayRef<Value *> Args,
147 ArrayRef<Type *> ArgTys = {},
148 ArrayRef<Value *> MDSources = {}) const;
149 SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
150 unsigned ToWidth) const;
151 Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
152 VectorType *ToType) const;
153
154 std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
155
156 unsigned getNumSignificantBits(const Value *V,
157 const Instruction *CtxI = nullptr) const;
158 KnownBits getKnownBits(const Value *V,
159 const Instruction *CtxI = nullptr) const;
160
161 bool isSafeToClone(const Instruction &In) const;
162
163 template <typename T = std::vector<Instruction *>>
164 bool isSafeToMoveBeforeInBB(const Instruction &In,
166 const T &IgnoreInsts = {}) const;
167
168 // This function is only used for assertions at the moment.
169 [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
170
171 Function &F;
172 const DataLayout &DL;
174 AssumptionCache &AC;
175 DominatorTree &DT;
176 ScalarEvolution &SE;
178 const HexagonSubtarget &HST;
179
180private:
181 Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
182 int Start, int Length) const;
183};
184
185class AlignVectors {
186 // This code tries to replace unaligned vector loads/stores with aligned
187 // ones.
188 // Consider unaligned load:
189 // %v = original_load %some_addr, align <bad>
190 // %user = %v
191 // It will generate
192 // = load ..., align <good>
193 // = load ..., align <good>
194 // = valign
195 // etc.
196 // %synthesize = combine/shuffle the loaded data so that it looks
197 // exactly like what "original_load" has loaded.
198 // %user = %synthesize
199 // Similarly for stores.
200public:
201 AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}
202
203 bool run();
204
205private:
206 using InstList = std::vector<Instruction *>;
208
209 struct AddrInfo {
210 AddrInfo(const AddrInfo &) = default;
211 AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
212 Align H)
213 : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
214 NeedAlign(HVC.getTypeAlignment(ValTy)) {}
215 AddrInfo &operator=(const AddrInfo &) = default;
216
217 // XXX: add Size member?
218 Instruction *Inst;
219 Value *Addr;
220 Type *ValTy;
221 Align HaveAlign;
222 Align NeedAlign;
223 int Offset = 0; // Offset (in bytes) from the first member of the
224 // containing AddrList.
225 };
226 using AddrList = std::vector<AddrInfo>;
227
228 struct InstrLess {
229 bool operator()(const Instruction *A, const Instruction *B) const {
230 return A->comesBefore(B);
231 }
232 };
233 using DepList = std::set<Instruction *, InstrLess>;
234
235 struct MoveGroup {
236 MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
237 : Base(B), Main{AI.Inst}, Clones{}, IsHvx(Hvx), IsLoad(Load) {}
238 MoveGroup() = default;
239 Instruction *Base; // Base instruction of the parent address group.
240 InstList Main; // Main group of instructions.
241 InstList Deps; // List of dependencies.
242 InstMap Clones; // Map from original Deps to cloned ones.
243 bool IsHvx; // Is this group of HVX instructions?
244 bool IsLoad; // Is this a load group?
245 };
246 using MoveList = std::vector<MoveGroup>;
247
248 struct ByteSpan {
249 // A representation of "interesting" bytes within a given span of memory.
250 // These bytes are those that are loaded or stored, and they don't have
251 // to cover the entire span of memory.
252 //
253 // The representation works by picking a contiguous sequence of bytes
254 // from somewhere within a llvm::Value, and placing it at a given offset
255 // within the span.
256 //
257 // The sequence of bytes from llvm:Value is represented by Segment.
258 // Block is Segment, plus where it goes in the span.
259 //
260 // An important feature of ByteSpan is being able to make a "section",
261 // i.e. creating another ByteSpan corresponding to a range of offsets
262 // relative to the source span.
263
264 struct Segment {
265 // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
266 Segment(Value *Val, int Begin, int Len)
267 : Val(Val), Start(Begin), Size(Len) {}
268 Segment(const Segment &Seg) = default;
269 Segment &operator=(const Segment &Seg) = default;
270 Value *Val; // Value representable as a sequence of bytes.
271 int Start; // First byte of the value that belongs to the segment.
272 int Size; // Number of bytes in the segment.
273 };
274
275 struct Block {
276 Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
277 Block(Value *Val, int Off, int Len, int Pos)
278 : Seg(Val, Off, Len), Pos(Pos) {}
279 Block(const Block &Blk) = default;
280 Block &operator=(const Block &Blk) = default;
281 Segment Seg; // Value segment.
282 int Pos; // Position (offset) of the block in the span.
283 };
284
285 int extent() const;
286 ByteSpan section(int Start, int Length) const;
287 ByteSpan &shift(int Offset);
288 SmallVector<Value *, 8> values() const;
289
290 int size() const { return Blocks.size(); }
291 Block &operator[](int i) { return Blocks[i]; }
292 const Block &operator[](int i) const { return Blocks[i]; }
293
294 std::vector<Block> Blocks;
295
296 using iterator = decltype(Blocks)::iterator;
297 iterator begin() { return Blocks.begin(); }
298 iterator end() { return Blocks.end(); }
299 using const_iterator = decltype(Blocks)::const_iterator;
300 const_iterator begin() const { return Blocks.begin(); }
301 const_iterator end() const { return Blocks.end(); }
302 };
303
304 std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
305 bool isHvx(const AddrInfo &AI) const;
306 // This function is only used for assertions at the moment.
307 [[maybe_unused]] bool isSectorTy(Type *Ty) const;
308
309 Value *getPayload(Value *Val) const;
310 Value *getMask(Value *Val) const;
311 Value *getPassThrough(Value *Val) const;
312
313 Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
314 int Adjust,
315 const InstMap &CloneMap = InstMap()) const;
316 Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
317 int Alignment,
318 const InstMap &CloneMap = InstMap()) const;
319
320 Value *createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
321 Value *Predicate, int Alignment, Value *Mask,
322 Value *PassThru, ArrayRef<Value *> MDSources = {}) const;
323 Value *createSimpleLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
324 int Alignment,
325 ArrayRef<Value *> MDSources = {}) const;
326
327 Value *createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
328 Value *Predicate, int Alignment, Value *Mask,
329 ArrayRef<Value *> MDSources = {}) const;
330 Value *createSimpleStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
331 int Alignment,
332 ArrayRef<Value *> MDSources = {}) const;
333
334 Value *createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
335 Value *Predicate, int Alignment,
336 ArrayRef<Value *> MDSources = {}) const;
337 Value *createPredicatedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
338 Value *Predicate, int Alignment,
339 ArrayRef<Value *> MDSources = {}) const;
340
341 DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
342 bool createAddressGroups();
343 MoveList createLoadGroups(const AddrList &Group) const;
344 MoveList createStoreGroups(const AddrList &Group) const;
345 bool moveTogether(MoveGroup &Move) const;
346 template <typename T>
347 InstMap cloneBefore(BasicBlock::iterator To, T &&Insts) const;
348
349 void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
350 int ScLen, Value *AlignVal, Value *AlignAddr) const;
351 void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
352 int ScLen, Value *AlignVal, Value *AlignAddr) const;
353 bool realignGroup(const MoveGroup &Move) const;
354
355 Value *makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
356 int Alignment) const;
357
358 friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
359 friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
360 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
361 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
362
363 std::map<Instruction *, AddrList> AddrGroups;
364 const HexagonVectorCombine &HVC;
365};
366
367[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
368 const AlignVectors::AddrInfo &AI) {
369 OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
370 OS << "Addr: " << *AI.Addr << '\n';
371 OS << "Type: " << *AI.ValTy << '\n';
372 OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
373 OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
374 OS << "Offset: " << AI.Offset;
375 return OS;
376}
377
378[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
379 const AlignVectors::MoveGroup &MG) {
380 OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
381 OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
382 OS << "Main\n";
383 for (Instruction *I : MG.Main)
384 OS << " " << *I << '\n';
385 OS << "Deps\n";
386 for (Instruction *I : MG.Deps)
387 OS << " " << *I << '\n';
388 OS << "Clones\n";
389 for (auto [K, V] : MG.Clones) {
390 OS << " ";
391 K->printAsOperand(OS, false);
392 OS << "\t-> " << *V << '\n';
393 }
394 return OS;
395}
396
397[[maybe_unused]] raw_ostream &
398operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) {
399 OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
400 if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {
401 OS << "(self:" << B.Seg.Val << ')';
402 } else if (B.Seg.Val != nullptr) {
403 OS << *B.Seg.Val;
404 } else {
405 OS << "(null)";
406 }
407 return OS;
408}
409
410[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
411 const AlignVectors::ByteSpan &BS) {
412 OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
413 for (const AlignVectors::ByteSpan::Block &B : BS)
414 OS << B << '\n';
415 OS << ']';
416 return OS;
417}
418
419class HvxIdioms {
420public:
421 enum DstQualifier {
422 Undefined = 0,
423 Arithmetic,
424 LdSt,
425 LLVM_Gather,
426 LLVM_Scatter,
427 HEX_Gather_Scatter,
428 HEX_Gather,
429 HEX_Scatter,
430 Call
431 };
432
433 HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
434 auto *Int32Ty = HVC.getIntTy(32);
435 HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
436 HvxP32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/true);
437 }
438
439 bool run();
440
441private:
442 enum Signedness { Positive, Signed, Unsigned };
443
444 // Value + sign
445 // This is to keep track of whether the value should be treated as signed
446 // or unsigned, or is known to be positive.
447 struct SValue {
448 Value *Val;
449 Signedness Sgn;
450 };
451
452 struct FxpOp {
453 unsigned Opcode;
454 unsigned Frac; // Number of fraction bits
455 SValue X, Y;
456 // If present, add 1 << RoundAt before shift:
457 std::optional<unsigned> RoundAt;
458 VectorType *ResTy;
459 };
460
461 auto getNumSignificantBits(Value *V, Instruction *In) const
462 -> std::pair<unsigned, Signedness>;
463 auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;
464
465 auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
466 auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;
467
468 auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
469 const FxpOp &Op) const -> Value *;
470 auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
471 bool Rounding) const -> Value *;
472 auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
473 bool Rounding) const -> Value *;
474 // Return {Result, Carry}, where Carry is a vector predicate.
475 auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
476 Value *CarryIn = nullptr) const
477 -> std::pair<Value *, Value *>;
478 auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
479 auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
480 -> Value *;
481 auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
482 -> std::pair<Value *, Value *>;
483 auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
485 auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
486 Signedness SgnX, ArrayRef<Value *> WordY,
487 Signedness SgnY) const -> SmallVector<Value *>;
488 // Vector manipulations for Ripple
489 bool matchScatter(Instruction &In) const;
490 bool matchGather(Instruction &In) const;
491 Value *processVScatter(Instruction &In) const;
492 Value *processVGather(Instruction &In) const;
493
494 VectorType *HvxI32Ty;
495 VectorType *HvxP32Ty;
496 const HexagonVectorCombine &HVC;
497
498 friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
499};
500
501[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
502 const HvxIdioms::FxpOp &Op) {
503 static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
504 OS << Instruction::getOpcodeName(Op.Opcode) << '.' << Op.Frac;
505 if (Op.RoundAt.has_value()) {
506 if (Op.Frac != 0 && *Op.RoundAt == Op.Frac - 1) {
507 OS << ":rnd";
508 } else {
509 OS << " + 1<<" << *Op.RoundAt;
510 }
511 }
512 OS << "\n X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
513 << " Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
514 return OS;
515}
516
517} // namespace
518
519namespace {
520
521template <typename T> T *getIfUnordered(T *MaybeT) {
522 return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;
523}
524template <typename T> T *isCandidate(Instruction *In) {
525 return dyn_cast<T>(In);
526}
528 return getIfUnordered(dyn_cast<LoadInst>(In));
529}
531 return getIfUnordered(dyn_cast<StoreInst>(In));
532}
533
534#if !defined(_MSC_VER) || _MSC_VER >= 1926
535// VS2017 and some versions of VS2019 have trouble compiling this:
536// error C2976: 'std::map': too few template arguments
537// VS 2019 16.x is known to work, except for 16.4/16.5 (MSC_VER 1924/1925)
538template <typename Pred, typename... Ts>
539void erase_if(std::map<Ts...> &map, Pred p)
540#else
541template <typename Pred, typename T, typename U>
542void erase_if(std::map<T, U> &map, Pred p)
543#endif
544{
545 for (auto i = map.begin(), e = map.end(); i != e;) {
546 if (p(*i))
547 i = map.erase(i);
548 else
549 i = std::next(i);
550 }
551}
552
553// Forward other erase_ifs to the LLVM implementations.
554template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
555 llvm::erase_if(std::forward<T>(container), p);
556}
557
558} // namespace
559
560// --- Begin AlignVectors
561
562// For brevity, only consider loads. We identify a group of loads where we
563// know the relative differences between their addresses, so we know how they
564// are laid out in memory (relative to one another). These loads can overlap,
565// can be shorter or longer than the desired vector length.
566// Ultimately we want to generate a sequence of aligned loads that will load
567// every byte that the original loads loaded, and have the program use these
568// loaded values instead of the original loads.
569// We consider the contiguous memory area spanned by all these loads.
570//
571// Let's say that a single aligned vector load can load 16 bytes at a time.
572// If the program wanted to use a byte at offset 13 from the beginning of the
573// original span, it will be a byte at offset 13+x in the aligned data for
574// some x>=0. This may happen to be in the first aligned load, or in the load
575// following it. Since we generally don't know what the that alignment value
576// is at compile time, we proactively do valigns on the aligned loads, so that
577// byte that was at offset 13 is still at offset 13 after the valigns.
578//
579// This will be the starting point for making the rest of the program use the
580// data loaded by the new loads.
581// For each original load, and its users:
582// %v = load ...
583// ... = %v
584// ... = %v
585// we create
586// %new_v = extract/combine/shuffle data from loaded/valigned vectors so
587// it contains the same value as %v did before
588// then replace all users of %v with %new_v.
589// ... = %new_v
590// ... = %new_v
591
592auto AlignVectors::ByteSpan::extent() const -> int {
593 if (size() == 0)
594 return 0;
595 int Min = Blocks[0].Pos;
596 int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
597 for (int i = 1, e = size(); i != e; ++i) {
598 Min = std::min(Min, Blocks[i].Pos);
599 Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);
600 }
601 return Max - Min;
602}
603
604auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
605 ByteSpan Section;
606 for (const ByteSpan::Block &B : Blocks) {
607 int L = std::max(B.Pos, Start); // Left end.
608 int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.
609 if (L < R) {
610 // How much to chop off the beginning of the segment:
611 int Off = L > B.Pos ? L - B.Pos : 0;
612 Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);
613 }
614 }
615 return Section;
616}
617
618auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
619 for (Block &B : Blocks)
620 B.Pos += Offset;
621 return *this;
622}
623
624auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
625 SmallVector<Value *, 8> Values(Blocks.size());
626 for (int i = 0, e = Blocks.size(); i != e; ++i)
627 Values[i] = Blocks[i].Seg.Val;
628 return Values;
629}
630
631auto AlignVectors::getAddrInfo(Instruction &In) const
632 -> std::optional<AddrInfo> {
633 if (auto *L = isCandidate<LoadInst>(&In))
634 return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
635 L->getAlign());
636 if (auto *S = isCandidate<StoreInst>(&In))
637 return AddrInfo(HVC, S, S->getPointerOperand(),
638 S->getValueOperand()->getType(), S->getAlign());
639 if (auto *II = isCandidate<IntrinsicInst>(&In)) {
640 Intrinsic::ID ID = II->getIntrinsicID();
641 switch (ID) {
642 case Intrinsic::masked_load:
643 return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),
644 II->getParamAlign(0).valueOrOne());
645 case Intrinsic::masked_store:
646 return AddrInfo(HVC, II, II->getArgOperand(1),
647 II->getArgOperand(0)->getType(),
648 II->getParamAlign(1).valueOrOne());
649 }
650 }
651 return std::nullopt;
652}
653
654auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
655 return HVC.HST.isTypeForHVX(AI.ValTy);
656}
657
658auto AlignVectors::getPayload(Value *Val) const -> Value * {
659 if (auto *In = dyn_cast<Instruction>(Val)) {
660 Intrinsic::ID ID = 0;
661 if (auto *II = dyn_cast<IntrinsicInst>(In))
662 ID = II->getIntrinsicID();
663 if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)
664 return In->getOperand(0);
665 }
666 return Val;
667}
668
669auto AlignVectors::getMask(Value *Val) const -> Value * {
670 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
671 switch (II->getIntrinsicID()) {
672 case Intrinsic::masked_load:
673 return II->getArgOperand(1);
674 case Intrinsic::masked_store:
675 return II->getArgOperand(2);
676 }
677 }
678
679 Type *ValTy = getPayload(Val)->getType();
680 if (auto *VecTy = dyn_cast<VectorType>(ValTy))
681 return Constant::getAllOnesValue(HVC.getBoolTy(HVC.length(VecTy)));
682 return Constant::getAllOnesValue(HVC.getBoolTy());
683}
684
685auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
686 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
687 if (II->getIntrinsicID() == Intrinsic::masked_load)
688 return II->getArgOperand(2);
689 }
690 return UndefValue::get(getPayload(Val)->getType());
691}
692
693auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
694 Type *ValTy, int Adjust,
695 const InstMap &CloneMap) const
696 -> Value * {
697 if (auto *I = dyn_cast<Instruction>(Ptr))
698 if (Instruction *New = CloneMap.lookup(I))
699 Ptr = New;
700 return Builder.CreatePtrAdd(Ptr, HVC.getConstInt(Adjust), "gep");
701}
702
703auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
704 Type *ValTy, int Alignment,
705 const InstMap &CloneMap) const
706 -> Value * {
707 auto remap = [&](Value *V) -> Value * {
708 if (auto *I = dyn_cast<Instruction>(V)) {
709 for (auto [Old, New] : CloneMap)
710 I->replaceUsesOfWith(Old, New);
711 return I;
712 }
713 return V;
714 };
715 Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy(), "pti");
716 Value *Mask = HVC.getConstInt(-Alignment);
717 Value *And = Builder.CreateAnd(remap(AsInt), Mask, "and");
718 return Builder.CreateIntToPtr(
719 And, PointerType::getUnqual(ValTy->getContext()), "itp");
720}
721
722auto AlignVectors::createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
723 Value *Predicate, int Alignment, Value *Mask,
724 Value *PassThru,
725 ArrayRef<Value *> MDSources) const -> Value * {
726 bool HvxHasPredLoad = HVC.HST.useHVXV62Ops();
727 // Predicate is nullptr if not creating predicated load
728 if (Predicate) {
729 assert(!Predicate->getType()->isVectorTy() &&
730 "Expectning scalar predicate");
731 if (HVC.isFalse(Predicate))
732 return UndefValue::get(ValTy);
733 if (!HVC.isTrue(Predicate) && HvxHasPredLoad) {
734 Value *Load = createPredicatedLoad(Builder, ValTy, Ptr, Predicate,
735 Alignment, MDSources);
736 return Builder.CreateSelect(Mask, Load, PassThru);
737 }
738 // Predicate == true here.
739 }
740 assert(!HVC.isUndef(Mask)); // Should this be allowed?
741 if (HVC.isZero(Mask))
742 return PassThru;
743 if (HVC.isTrue(Mask))
744 return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);
745
746 Instruction *Load = Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment),
747 Mask, PassThru, "mld");
748 propagateMetadata(Load, MDSources);
749 return Load;
750}
751
752auto AlignVectors::createSimpleLoad(IRBuilderBase &Builder, Type *ValTy,
753 Value *Ptr, int Alignment,
754 ArrayRef<Value *> MDSources) const
755 -> Value * {
757 Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment), "ald");
758 propagateMetadata(Load, MDSources);
759 return Load;
760}
761
762auto AlignVectors::createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy,
763 Value *Ptr, Value *Predicate,
764 int Alignment,
765 ArrayRef<Value *> MDSources) const
766 -> Value * {
767 assert(HVC.HST.isTypeForHVX(ValTy) &&
768 "Predicates 'scalar' vector loads not yet supported");
769 assert(Predicate);
770 assert(!Predicate->getType()->isVectorTy() && "Expectning scalar predicate");
771 assert(HVC.getSizeOf(ValTy, HVC.Alloc) % Alignment == 0);
772 if (HVC.isFalse(Predicate))
773 return UndefValue::get(ValTy);
774 if (HVC.isTrue(Predicate))
775 return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);
776
777 auto V6_vL32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vL32b_pred_ai);
778 // FIXME: This may not put the offset from Ptr into the vmem offset.
779 return HVC.createHvxIntrinsic(Builder, V6_vL32b_pred_ai, ValTy,
780 {Predicate, Ptr, HVC.getConstInt(0)}, {},
781 MDSources);
782}
783
784auto AlignVectors::createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
785 Value *Predicate, int Alignment, Value *Mask,
786 ArrayRef<Value *> MDSources) const -> Value * {
787 if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))
788 return UndefValue::get(Val->getType());
789 assert(!Predicate || (!Predicate->getType()->isVectorTy() &&
790 "Expectning scalar predicate"));
791 if (Predicate) {
792 if (HVC.isFalse(Predicate))
793 return UndefValue::get(Val->getType());
794 if (HVC.isTrue(Predicate))
795 Predicate = nullptr;
796 }
797 // Here both Predicate and Mask are true or unknown.
798
799 if (HVC.isTrue(Mask)) {
800 if (Predicate) { // Predicate unknown
801 return createPredicatedStore(Builder, Val, Ptr, Predicate, Alignment,
802 MDSources);
803 }
804 // Predicate is true:
805 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
806 }
807
808 // Mask is unknown
809 if (!Predicate) {
811 Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
812 propagateMetadata(Store, MDSources);
813 return Store;
814 }
815
816 // Both Predicate and Mask are unknown.
817 // Emulate masked store with predicated-load + mux + predicated-store.
818 Value *PredLoad = createPredicatedLoad(Builder, Val->getType(), Ptr,
819 Predicate, Alignment, MDSources);
820 Value *Mux = Builder.CreateSelect(Mask, Val, PredLoad);
821 return createPredicatedStore(Builder, Mux, Ptr, Predicate, Alignment,
822 MDSources);
823}
824
825auto AlignVectors::createSimpleStore(IRBuilderBase &Builder, Value *Val,
826 Value *Ptr, int Alignment,
827 ArrayRef<Value *> MDSources) const
828 -> Value * {
829 Instruction *Store = Builder.CreateAlignedStore(Val, Ptr, Align(Alignment));
830 propagateMetadata(Store, MDSources);
831 return Store;
832}
833
834auto AlignVectors::createPredicatedStore(IRBuilderBase &Builder, Value *Val,
835 Value *Ptr, Value *Predicate,
836 int Alignment,
837 ArrayRef<Value *> MDSources) const
838 -> Value * {
839 assert(HVC.HST.isTypeForHVX(Val->getType()) &&
840 "Predicates 'scalar' vector stores not yet supported");
841 assert(Predicate);
842 if (HVC.isFalse(Predicate))
843 return UndefValue::get(Val->getType());
844 if (HVC.isTrue(Predicate))
845 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
846
847 assert(HVC.getSizeOf(Val, HVC.Alloc) % Alignment == 0);
848 auto V6_vS32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vS32b_pred_ai);
849 // FIXME: This may not put the offset from Ptr into the vmem offset.
850 return HVC.createHvxIntrinsic(Builder, V6_vS32b_pred_ai, nullptr,
851 {Predicate, Ptr, HVC.getConstInt(0), Val}, {},
852 MDSources);
853}
854
855auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
856 -> DepList {
857 BasicBlock *Parent = Base->getParent();
858 assert(In->getParent() == Parent &&
859 "Base and In should be in the same block");
860 assert(Base->comesBefore(In) && "Base should come before In");
861
862 DepList Deps;
863 std::deque<Instruction *> WorkQ = {In};
864 while (!WorkQ.empty()) {
865 Instruction *D = WorkQ.front();
866 WorkQ.pop_front();
867 if (D != In)
868 Deps.insert(D);
869 for (Value *Op : D->operands()) {
870 if (auto *I = dyn_cast<Instruction>(Op)) {
871 if (I->getParent() == Parent && Base->comesBefore(I))
872 WorkQ.push_back(I);
873 }
874 }
875 }
876 return Deps;
877}
878
879auto AlignVectors::createAddressGroups() -> bool {
880 // An address group created here may contain instructions spanning
881 // multiple basic blocks.
882 AddrList WorkStack;
883
884 auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
885 for (AddrInfo &W : WorkStack) {
886 if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))
887 return std::make_pair(W.Inst, *D);
888 }
889 return std::make_pair(nullptr, 0);
890 };
891
892 auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
893 BasicBlock &Block = *DomN->getBlock();
894 for (Instruction &I : Block) {
895 auto AI = this->getAddrInfo(I); // Use this-> for gcc6.
896 if (!AI)
897 continue;
898 auto F = findBaseAndOffset(*AI);
899 Instruction *GroupInst;
900 if (Instruction *BI = F.first) {
901 AI->Offset = F.second;
902 GroupInst = BI;
903 } else {
904 WorkStack.push_back(*AI);
905 GroupInst = AI->Inst;
906 }
907 AddrGroups[GroupInst].push_back(*AI);
908 }
909
910 for (DomTreeNode *C : DomN->children())
911 Visit(C, Visit);
912
913 while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
914 WorkStack.pop_back();
915 };
916
917 traverseBlock(HVC.DT.getRootNode(), traverseBlock);
918 assert(WorkStack.empty());
919
920 // AddrGroups are formed.
921
922 // Remove groups of size 1.
923 erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; });
924 // Remove groups that don't use HVX types.
925 erase_if(AddrGroups, [&](auto &G) {
926 return llvm::none_of(
927 G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
928 });
929
930 return !AddrGroups.empty();
931}
932
933auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
934 // Form load groups.
935 // To avoid complications with moving code across basic blocks, only form
936 // groups that are contained within a single basic block.
937 unsigned SizeLimit = VAGroupSizeLimit;
938 if (SizeLimit == 0)
939 return {};
940
941 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
942 assert(!Move.Main.empty() && "Move group should have non-empty Main");
943 if (Move.Main.size() >= SizeLimit)
944 return false;
945 // Don't mix HVX and non-HVX instructions.
946 if (Move.IsHvx != isHvx(Info))
947 return false;
948 // Leading instruction in the load group.
949 Instruction *Base = Move.Main.front();
950 if (Base->getParent() != Info.Inst->getParent())
951 return false;
952 // Check if it's safe to move the load.
953 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator()))
954 return false;
955 // And if it's safe to clone the dependencies.
956 auto isSafeToCopyAtBase = [&](const Instruction *I) {
957 return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator()) &&
958 HVC.isSafeToClone(*I);
959 };
960 DepList Deps = getUpwardDeps(Info.Inst, Base);
961 if (!llvm::all_of(Deps, isSafeToCopyAtBase))
962 return false;
963
964 Move.Main.push_back(Info.Inst);
965 llvm::append_range(Move.Deps, Deps);
966 return true;
967 };
968
969 MoveList LoadGroups;
970
971 for (const AddrInfo &Info : Group) {
972 if (!Info.Inst->mayReadFromMemory())
973 continue;
974 if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
975 LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);
976 }
977
978 // Erase singleton groups.
979 erase_if(LoadGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
980
981 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
982 if (!HVC.HST.useHVXV62Ops())
983 erase_if(LoadGroups, [](const MoveGroup &G) { return G.IsHvx; });
984
985 return LoadGroups;
986}
987
988auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
989 // Form store groups.
990 // To avoid complications with moving code across basic blocks, only form
991 // groups that are contained within a single basic block.
992 unsigned SizeLimit = VAGroupSizeLimit;
993 if (SizeLimit == 0)
994 return {};
995
996 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
997 assert(!Move.Main.empty() && "Move group should have non-empty Main");
998 if (Move.Main.size() >= SizeLimit)
999 return false;
1000 // For stores with return values we'd have to collect downward dependencies.
1001 // There are no such stores that we handle at the moment, so omit that.
1002 assert(Info.Inst->getType()->isVoidTy() &&
1003 "Not handling stores with return values");
1004 // Don't mix HVX and non-HVX instructions.
1005 if (Move.IsHvx != isHvx(Info))
1006 return false;
1007 // For stores we need to be careful whether it's safe to move them.
1008 // Stores that are otherwise safe to move together may not appear safe
1009 // to move over one another (i.e. isSafeToMoveBefore may return false).
1010 Instruction *Base = Move.Main.front();
1011 if (Base->getParent() != Info.Inst->getParent())
1012 return false;
1013 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(), Move.Main))
1014 return false;
1015 Move.Main.push_back(Info.Inst);
1016 return true;
1017 };
1018
1019 MoveList StoreGroups;
1020
1021 for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
1022 const AddrInfo &Info = *I;
1023 if (!Info.Inst->mayWriteToMemory())
1024 continue;
1025 if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
1026 StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);
1027 }
1028
1029 // Erase singleton groups.
1030 erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
1031
1032 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1033 if (!HVC.HST.useHVXV62Ops())
1034 erase_if(StoreGroups, [](const MoveGroup &G) { return G.IsHvx; });
1035
1036 // Erase groups where every store is a full HVX vector. The reason is that
1037 // aligning predicated stores generates complex code that may be less
1038 // efficient than a sequence of unaligned vector stores.
1039 if (!VADoFullStores) {
1040 erase_if(StoreGroups, [this](const MoveGroup &G) {
1041 return G.IsHvx && llvm::all_of(G.Main, [this](Instruction *S) {
1042 auto MaybeInfo = this->getAddrInfo(*S);
1043 assert(MaybeInfo.has_value());
1044 return HVC.HST.isHVXVectorType(
1045 EVT::getEVT(MaybeInfo->ValTy, false));
1046 });
1047 });
1048 }
1049
1050 return StoreGroups;
1051}
1052
1053auto AlignVectors::moveTogether(MoveGroup &Move) const -> bool {
1054 // Move all instructions to be adjacent.
1055 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1056 Instruction *Where = Move.Main.front();
1057
1058 if (Move.IsLoad) {
1059 // Move all the loads (and dependencies) to where the first load is.
1060 // Clone all deps to before Where, keeping order.
1061 Move.Clones = cloneBefore(Where->getIterator(), Move.Deps);
1062 // Move all main instructions to after Where, keeping order.
1063 ArrayRef<Instruction *> Main(Move.Main);
1064 for (Instruction *M : Main) {
1065 if (M != Where)
1066 M->moveAfter(Where);
1067 for (auto [Old, New] : Move.Clones)
1068 M->replaceUsesOfWith(Old, New);
1069 Where = M;
1070 }
1071 // Replace Deps with the clones.
1072 for (int i = 0, e = Move.Deps.size(); i != e; ++i)
1073 Move.Deps[i] = Move.Clones[Move.Deps[i]];
1074 } else {
1075 // Move all the stores to where the last store is.
1076 // NOTE: Deps are empty for "store" groups. If they need to be
1077 // non-empty, decide on the order.
1078 assert(Move.Deps.empty());
1079 // Move all main instructions to before Where, inverting order.
1080 ArrayRef<Instruction *> Main(Move.Main);
1081 for (Instruction *M : Main.drop_front(1)) {
1082 M->moveBefore(Where->getIterator());
1083 Where = M;
1084 }
1085 }
1086
1087 return Move.Main.size() + Move.Deps.size() > 1;
1088}
1089
1090template <typename T>
1091auto AlignVectors::cloneBefore(BasicBlock::iterator To, T &&Insts) const
1092 -> InstMap {
1093 InstMap Map;
1094
1095 for (Instruction *I : Insts) {
1096 assert(HVC.isSafeToClone(*I));
1097 Instruction *C = I->clone();
1098 C->setName(Twine("c.") + I->getName() + ".");
1099 C->insertBefore(To);
1100
1101 for (auto [Old, New] : Map)
1102 C->replaceUsesOfWith(Old, New);
1103 Map.insert(std::make_pair(I, C));
1104 }
1105 return Map;
1106}
1107
1108auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
1109 const ByteSpan &VSpan, int ScLen,
1110 Value *AlignVal, Value *AlignAddr) const
1111 -> void {
1112 LLVM_DEBUG(dbgs() << __func__ << "\n");
1113
1114 Type *SecTy = HVC.getByteTy(ScLen);
1115 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1116 bool DoAlign = !HVC.isZero(AlignVal);
1117 BasicBlock::iterator BasePos = Builder.GetInsertPoint();
1118 BasicBlock *BaseBlock = Builder.GetInsertBlock();
1119
1120 ByteSpan ASpan;
1121 auto *True = Constant::getAllOnesValue(HVC.getBoolTy(ScLen));
1122 auto *Undef = UndefValue::get(SecTy);
1123
1124 // Created load does not have to be "Instruction" (e.g. "undef").
1125 SmallVector<Value *> Loads(NumSectors + DoAlign, nullptr);
1126
1127 // We could create all of the aligned loads, and generate the valigns
1128 // at the location of the first load, but for large load groups, this
1129 // could create highly suboptimal code (there have been groups of 140+
1130 // loads in real code).
1131 // Instead, place the loads/valigns as close to the users as possible.
1132 // In any case we need to have a mapping from the blocks of VSpan (the
1133 // span covered by the pre-existing loads) to ASpan (the span covered
1134 // by the aligned loads). There is a small problem, though: ASpan needs
1135 // to have pointers to the loads/valigns, but we don't have these loads
1136 // because we don't know where to put them yet. We find out by creating
1137 // a section of ASpan that corresponds to values (blocks) from VSpan,
1138 // and checking where the new load should be placed. We need to attach
1139 // this location information to each block in ASpan somehow, so we put
1140 // distincts values for Seg.Val in each ASpan.Blocks[i], and use a map
1141 // to store the location for each Seg.Val.
1142 // The distinct values happen to be Blocks[i].Seg.Val = &Blocks[i],
1143 // which helps with printing ByteSpans without crashing when printing
1144 // Segments with these temporary identifiers in place of Val.
1145
1146 // Populate the blocks first, to avoid reallocations of the vector
1147 // interfering with generating the placeholder addresses.
1148 for (int Index = 0; Index != NumSectors; ++Index)
1149 ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
1150 for (int Index = 0; Index != NumSectors; ++Index) {
1151 ASpan.Blocks[Index].Seg.Val =
1152 reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
1153 }
1154
1155 // Multiple values from VSpan can map to the same value in ASpan. Since we
1156 // try to create loads lazily, we need to find the earliest use for each
1157 // value from ASpan.
1158 DenseMap<void *, Instruction *> EarliestUser;
1159 auto isEarlier = [](Instruction *A, Instruction *B) {
1160 if (B == nullptr)
1161 return true;
1162 if (A == nullptr)
1163 return false;
1164 assert(A->getParent() == B->getParent());
1165 return A->comesBefore(B);
1166 };
1167 auto earliestUser = [&](const auto &Uses) {
1168 Instruction *User = nullptr;
1169 for (const Use &U : Uses) {
1170 auto *I = dyn_cast<Instruction>(U.getUser());
1171 assert(I != nullptr && "Load used in a non-instruction?");
1172 // Make sure we only consider users in this block, but we need
1173 // to remember if there were users outside the block too. This is
1174 // because if no users are found, aligned loads will not be created.
1175 if (I->getParent() == BaseBlock) {
1176 if (!isa<PHINode>(I))
1177 User = std::min(User, I, isEarlier);
1178 } else {
1179 User = std::min(User, BaseBlock->getTerminator(), isEarlier);
1180 }
1181 }
1182 return User;
1183 };
1184
1185 for (const ByteSpan::Block &B : VSpan) {
1186 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
1187 for (const ByteSpan::Block &S : ASection) {
1188 auto &EU = EarliestUser[S.Seg.Val];
1189 EU = std::min(EU, earliestUser(B.Seg.Val->uses()), isEarlier);
1190 }
1191 }
1192
1193 LLVM_DEBUG({
1194 dbgs() << "ASpan:\n" << ASpan << '\n';
1195 dbgs() << "Earliest users of ASpan:\n";
1196 for (auto &[Val, User] : EarliestUser) {
1197 dbgs() << Val << "\n ->" << *User << '\n';
1198 }
1199 });
1200
1201 auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
1202 int Index, bool MakePred) {
1203 Value *Ptr =
1204 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1205 Value *Predicate =
1206 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1207
1208 // If vector shifting is potentially needed, accumulate metadata
1209 // from source sections of twice the load width.
1210 int Start = (Index - DoAlign) * ScLen;
1211 int Width = (1 + DoAlign) * ScLen;
1212 return this->createLoad(Builder, SecTy, Ptr, Predicate, ScLen, True, Undef,
1213 VSpan.section(Start, Width).values());
1214 };
1215
1216 auto moveBefore = [this](BasicBlock::iterator In, BasicBlock::iterator To) {
1217 // Move In and its upward dependencies to before To.
1218 assert(In->getParent() == To->getParent());
1219 DepList Deps = getUpwardDeps(&*In, &*To);
1220 In->moveBefore(To);
1221 // DepList is sorted with respect to positions in the basic block.
1222 InstMap Map = cloneBefore(In, Deps);
1223 for (auto [Old, New] : Map)
1224 In->replaceUsesOfWith(Old, New);
1225 };
1226
1227 // Generate necessary loads at appropriate locations.
1228 LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");
1229 for (int Index = 0; Index != NumSectors + 1; ++Index) {
1230 // In ASpan, each block will be either a single aligned load, or a
1231 // valign of a pair of loads. In the latter case, an aligned load j
1232 // will belong to the current valign, and the one in the previous
1233 // block (for j > 0).
1234 // Place the load at a location which will dominate the valign, assuming
1235 // the valign will be placed right before the earliest user.
1236 Instruction *PrevAt =
1237 DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
1238 Instruction *ThisAt =
1239 Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
1240 if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
1241 Builder.SetInsertPoint(Where);
1242 Loads[Index] =
1243 createLoad(Builder, VSpan, Index, DoAlign && Index == NumSectors);
1244 // We know it's safe to put the load at BasePos, but we'd prefer to put
1245 // it at "Where". To see if the load is safe to be placed at Where, put
1246 // it there first and then check if it's safe to move it to BasePos.
1247 // If not, then the load needs to be placed at BasePos.
1248 // We can't do this check proactively because we need the load to exist
1249 // in order to check legality.
1250 if (auto *Load = dyn_cast<Instruction>(Loads[Index])) {
1251 if (!HVC.isSafeToMoveBeforeInBB(*Load, BasePos))
1252 moveBefore(Load->getIterator(), BasePos);
1253 }
1254 LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');
1255 }
1256 }
1257
1258 // Generate valigns if needed, and fill in proper values in ASpan
1259 LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");
1260 for (int Index = 0; Index != NumSectors; ++Index) {
1261 ASpan[Index].Seg.Val = nullptr;
1262 if (auto *Where = EarliestUser[&ASpan[Index]]) {
1263 Builder.SetInsertPoint(Where);
1264 Value *Val = Loads[Index];
1265 assert(Val != nullptr);
1266 if (DoAlign) {
1267 Value *NextLoad = Loads[Index + 1];
1268 assert(NextLoad != nullptr);
1269 Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
1270 }
1271 ASpan[Index].Seg.Val = Val;
1272 LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');
1273 }
1274 }
1275
1276 for (const ByteSpan::Block &B : VSpan) {
1277 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
1278 Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
1279 Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));
1280
1281 // We're generating a reduction, where each instruction depends on
1282 // the previous one, so we need to order them according to the position
1283 // of their inputs in the code.
1284 std::vector<ByteSpan::Block *> ABlocks;
1285 for (ByteSpan::Block &S : ASection) {
1286 if (S.Seg.Val != nullptr)
1287 ABlocks.push_back(&S);
1288 }
1289 llvm::sort(ABlocks,
1290 [&](const ByteSpan::Block *A, const ByteSpan::Block *B) {
1291 return isEarlier(cast<Instruction>(A->Seg.Val),
1292 cast<Instruction>(B->Seg.Val));
1293 });
1294 for (ByteSpan::Block *S : ABlocks) {
1295 // The processing of the data loaded by the aligned loads
1296 // needs to be inserted after the data is available.
1297 Instruction *SegI = cast<Instruction>(S->Seg.Val);
1298 Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
1299 Value *Pay = HVC.vbytes(Builder, getPayload(S->Seg.Val));
1300 Accum =
1301 HVC.insertb(Builder, Accum, Pay, S->Seg.Start, S->Seg.Size, S->Pos);
1302 }
1303 // Instead of casting everything to bytes for the vselect, cast to the
1304 // original value type. This will avoid complications with casting masks.
1305 // For example, in cases when the original mask applied to i32, it could
1306 // be converted to a mask applicable to i8 via pred_typecast intrinsic,
1307 // but if the mask is not exactly of HVX length, extra handling would be
1308 // needed to make it work.
1309 Type *ValTy = getPayload(B.Seg.Val)->getType();
1310 Value *Cast = Builder.CreateBitCast(Accum, ValTy, "cst");
1311 Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
1312 getPassThrough(B.Seg.Val), "sel");
1313 B.Seg.Val->replaceAllUsesWith(Sel);
1314 }
1315}
1316
1317auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
1318 const ByteSpan &VSpan, int ScLen,
1319 Value *AlignVal, Value *AlignAddr) const
1320 -> void {
1321 LLVM_DEBUG(dbgs() << __func__ << "\n");
1322
1323 Type *SecTy = HVC.getByteTy(ScLen);
1324 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1325 bool DoAlign = !HVC.isZero(AlignVal);
1326
1327 // Stores.
1328 ByteSpan ASpanV, ASpanM;
1329
1330 // Return a vector value corresponding to the input value Val:
1331 // either <1 x Val> for scalar Val, or Val itself for vector Val.
1332 auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
1333 Type *Ty = Val->getType();
1334 if (Ty->isVectorTy())
1335 return Val;
1336 auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
1337 return Builder.CreateBitCast(Val, VecTy, "cst");
1338 };
1339
1340 // Create an extra "undef" sector at the beginning and at the end.
1341 // They will be used as the left/right filler in the vlalign step.
1342 for (int Index = (DoAlign ? -1 : 0); Index != NumSectors + DoAlign; ++Index) {
1343 // For stores, the size of each section is an aligned vector length.
1344 // Adjust the store offsets relative to the section start offset.
1345 ByteSpan VSection =
1346 VSpan.section(Index * ScLen, ScLen).shift(-Index * ScLen);
1347 Value *Undef = UndefValue::get(SecTy);
1349 Value *AccumV = Undef;
1350 Value *AccumM = Zero;
1351 for (ByteSpan::Block &S : VSection) {
1352 Value *Pay = getPayload(S.Seg.Val);
1353 Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
1354 Pay->getType(), HVC.getByteTy());
1355 Value *PartM = HVC.insertb(Builder, Zero, HVC.vbytes(Builder, Mask),
1356 S.Seg.Start, S.Seg.Size, S.Pos);
1357 AccumM = Builder.CreateOr(AccumM, PartM);
1358
1359 Value *PartV = HVC.insertb(Builder, Undef, HVC.vbytes(Builder, Pay),
1360 S.Seg.Start, S.Seg.Size, S.Pos);
1361
1362 AccumV = Builder.CreateSelect(
1363 Builder.CreateICmp(CmpInst::ICMP_NE, PartM, Zero), PartV, AccumV);
1364 }
1365 ASpanV.Blocks.emplace_back(AccumV, ScLen, Index * ScLen);
1366 ASpanM.Blocks.emplace_back(AccumM, ScLen, Index * ScLen);
1367 }
1368
1369 LLVM_DEBUG({
1370 dbgs() << "ASpanV before vlalign:\n" << ASpanV << '\n';
1371 dbgs() << "ASpanM before vlalign:\n" << ASpanM << '\n';
1372 });
1373
1374 // vlalign
1375 if (DoAlign) {
1376 for (int Index = 1; Index != NumSectors + 2; ++Index) {
1377 Value *PrevV = ASpanV[Index - 1].Seg.Val, *ThisV = ASpanV[Index].Seg.Val;
1378 Value *PrevM = ASpanM[Index - 1].Seg.Val, *ThisM = ASpanM[Index].Seg.Val;
1379 assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
1380 ASpanV[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
1381 ASpanM[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
1382 }
1383 }
1384
1385 LLVM_DEBUG({
1386 dbgs() << "ASpanV after vlalign:\n" << ASpanV << '\n';
1387 dbgs() << "ASpanM after vlalign:\n" << ASpanM << '\n';
1388 });
1389
1390 auto createStore = [&](IRBuilderBase &Builder, const ByteSpan &ASpanV,
1391 const ByteSpan &ASpanM, int Index, bool MakePred) {
1392 Value *Val = ASpanV[Index].Seg.Val;
1393 Value *Mask = ASpanM[Index].Seg.Val; // bytes
1394 if (HVC.isUndef(Val) || HVC.isZero(Mask))
1395 return;
1396 Value *Ptr =
1397 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1398 Value *Predicate =
1399 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1400
1401 // If vector shifting is potentially needed, accumulate metadata
1402 // from source sections of twice the store width.
1403 int Start = (Index - DoAlign) * ScLen;
1404 int Width = (1 + DoAlign) * ScLen;
1405 this->createStore(Builder, Val, Ptr, Predicate, ScLen,
1406 HVC.vlsb(Builder, Mask),
1407 VSpan.section(Start, Width).values());
1408 };
1409
1410 for (int Index = 0; Index != NumSectors + DoAlign; ++Index) {
1411 createStore(Builder, ASpanV, ASpanM, Index, DoAlign && Index == NumSectors);
1412 }
1413}
1414
1415auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
1416 LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');
1417
1418 // TODO: Needs support for masked loads/stores of "scalar" vectors.
1419 if (!Move.IsHvx)
1420 return false;
1421
1422 // Return the element with the maximum alignment from Range,
1423 // where GetValue obtains the value to compare from an element.
1424 auto getMaxOf = [](auto Range, auto GetValue) {
1425 return *llvm::max_element(Range, [&GetValue](auto &A, auto &B) {
1426 return GetValue(A) < GetValue(B);
1427 });
1428 };
1429
1430 const AddrList &BaseInfos = AddrGroups.at(Move.Base);
1431
1432 // Conceptually, there is a vector of N bytes covering the addresses
1433 // starting from the minimum offset (i.e. Base.Addr+Start). This vector
1434 // represents a contiguous memory region that spans all accessed memory
1435 // locations.
1436 // The correspondence between loaded or stored values will be expressed
1437 // in terms of this vector. For example, the 0th element of the vector
1438 // from the Base address info will start at byte Start from the beginning
1439 // of this conceptual vector.
1440 //
1441 // This vector will be loaded/stored starting at the nearest down-aligned
1442 // address and the amount od the down-alignment will be AlignVal:
1443 // valign(load_vector(align_down(Base+Start)), AlignVal)
1444
1445 std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
1446 AddrList MoveInfos;
1448 BaseInfos, std::back_inserter(MoveInfos),
1449 [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });
1450
1451 // Maximum alignment present in the whole address group.
1452 const AddrInfo &WithMaxAlign =
1453 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
1454 Align MaxGiven = WithMaxAlign.HaveAlign;
1455
1456 // Minimum alignment present in the move address group.
1457 const AddrInfo &WithMinOffset =
1458 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
1459
1460 const AddrInfo &WithMaxNeeded =
1461 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
1462 Align MinNeeded = WithMaxNeeded.NeedAlign;
1463
1464 // Set the builder's insertion point right before the load group, or
1465 // immediately after the store group. (Instructions in a store group are
1466 // listed in reverse order.)
1467 Instruction *InsertAt = Move.Main.front();
1468 if (!Move.IsLoad) {
1469 // There should be a terminator (which store isn't, but check anyways).
1470 assert(InsertAt->getIterator() != InsertAt->getParent()->end());
1471 InsertAt = &*std::next(InsertAt->getIterator());
1472 }
1473
1474 IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
1475 InstSimplifyFolder(HVC.DL));
1476 Value *AlignAddr = nullptr; // Actual aligned address.
1477 Value *AlignVal = nullptr; // Right-shift amount (for valign).
1478
1479 if (MinNeeded <= MaxGiven) {
1480 int Start = WithMinOffset.Offset;
1481 int OffAtMax = WithMaxAlign.Offset;
1482 // Shift the offset of the maximally aligned instruction (OffAtMax)
1483 // back by just enough multiples of the required alignment to cover the
1484 // distance from Start to OffAtMax.
1485 // Calculate the address adjustment amount based on the address with the
1486 // maximum alignment. This is to allow a simple gep instruction instead
1487 // of potential bitcasts to i8*.
1488 int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());
1489 AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,
1490 WithMaxAlign.ValTy, Adjust, Move.Clones);
1491 int Diff = Start - (OffAtMax + Adjust);
1492 AlignVal = HVC.getConstInt(Diff);
1493 assert(Diff >= 0);
1494 assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
1495 } else {
1496 // WithMinOffset is the lowest address in the group,
1497 // WithMinOffset.Addr = Base+Start.
1498 // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
1499 // mask off unnecessary bits, so it's ok to just the original pointer as
1500 // the alignment amount.
1501 // Do an explicit down-alignment of the address to avoid creating an
1502 // aligned instruction with an address that is not really aligned.
1503 AlignAddr =
1504 createAlignedPointer(Builder, WithMinOffset.Addr, WithMinOffset.ValTy,
1505 MinNeeded.value(), Move.Clones);
1506 AlignVal =
1507 Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy(), "pti");
1508 if (auto *I = dyn_cast<Instruction>(AlignVal)) {
1509 for (auto [Old, New] : Move.Clones)
1510 I->replaceUsesOfWith(Old, New);
1511 }
1512 }
1513
1514 ByteSpan VSpan;
1515 for (const AddrInfo &AI : MoveInfos) {
1516 VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),
1517 AI.Offset - WithMinOffset.Offset);
1518 }
1519
1520 // The aligned loads/stores will use blocks that are either scalars,
1521 // or HVX vectors. Let "sector" be the unified term for such a block.
1522 // blend(scalar, vector) -> sector...
1523 int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
1524 : std::max<int>(MinNeeded.value(), 4);
1525 assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
1526 assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
1527
1528 LLVM_DEBUG({
1529 dbgs() << "ScLen: " << ScLen << "\n";
1530 dbgs() << "AlignVal:" << *AlignVal << "\n";
1531 dbgs() << "AlignAddr:" << *AlignAddr << "\n";
1532 dbgs() << "VSpan:\n" << VSpan << '\n';
1533 });
1534
1535 if (Move.IsLoad)
1536 realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1537 else
1538 realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1539
1540 for (auto *Inst : Move.Main)
1541 Inst->eraseFromParent();
1542
1543 return true;
1544}
1545
1546auto AlignVectors::makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
1547 int Alignment) const -> Value * {
1548 auto *AlignTy = AlignVal->getType();
1549 Value *And = Builder.CreateAnd(
1550 AlignVal, ConstantInt::get(AlignTy, Alignment - 1), "and");
1551 Value *Zero = ConstantInt::get(AlignTy, 0);
1552 return Builder.CreateICmpNE(And, Zero, "isz");
1553}
1554
1555auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
1556 if (!HVC.isByteVecTy(Ty))
1557 return false;
1558 int Size = HVC.getSizeOf(Ty);
1559 if (HVC.HST.isTypeForHVX(Ty))
1560 return Size == static_cast<int>(HVC.HST.getVectorLength());
1561 return Size == 4 || Size == 8;
1562}
1563
1564auto AlignVectors::run() -> bool {
1565 LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
1566 << '\n');
1567 if (!createAddressGroups())
1568 return false;
1569
1570 LLVM_DEBUG({
1571 dbgs() << "Address groups(" << AddrGroups.size() << "):\n";
1572 for (auto &[In, AL] : AddrGroups) {
1573 for (const AddrInfo &AI : AL)
1574 dbgs() << "---\n" << AI << '\n';
1575 }
1576 });
1577
1578 bool Changed = false;
1579 MoveList LoadGroups, StoreGroups;
1580
1581 for (auto &G : AddrGroups) {
1582 llvm::append_range(LoadGroups, createLoadGroups(G.second));
1583 llvm::append_range(StoreGroups, createStoreGroups(G.second));
1584 }
1585
1586 LLVM_DEBUG({
1587 dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";
1588 for (const MoveGroup &G : LoadGroups)
1589 dbgs() << G << "\n";
1590 dbgs() << "Store groups(" << StoreGroups.size() << "):\n";
1591 for (const MoveGroup &G : StoreGroups)
1592 dbgs() << G << "\n";
1593 });
1594
1595 // Cumulative limit on the number of groups.
1596 unsigned CountLimit = VAGroupCountLimit;
1597 if (CountLimit == 0)
1598 return false;
1599
1600 if (LoadGroups.size() > CountLimit) {
1601 LoadGroups.resize(CountLimit);
1602 StoreGroups.clear();
1603 } else {
1604 unsigned StoreLimit = CountLimit - LoadGroups.size();
1605 if (StoreGroups.size() > StoreLimit)
1606 StoreGroups.resize(StoreLimit);
1607 }
1608
1609 for (auto &M : LoadGroups)
1610 Changed |= moveTogether(M);
1611 for (auto &M : StoreGroups)
1612 Changed |= moveTogether(M);
1613
1614 LLVM_DEBUG(dbgs() << "After moveTogether:\n" << HVC.F);
1615
1616 for (auto &M : LoadGroups)
1617 Changed |= realignGroup(M);
1618 for (auto &M : StoreGroups)
1619 Changed |= realignGroup(M);
1620
1621 return Changed;
1622}
1623
1624// --- End AlignVectors
1625
1626// --- Begin HvxIdioms
1627
1628auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
1629 -> std::pair<unsigned, Signedness> {
1630 unsigned Bits = HVC.getNumSignificantBits(V, In);
1631 // The significant bits are calculated including the sign bit. This may
1632 // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
1633 // result in 33 significant bits. To avoid extra words, skip the extra
1634 // sign bit, but keep information that the value is to be treated as
1635 // unsigned.
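// For example, a value produced by (zext i16 to i32) reports 17 significant
// bits; its top 16 bits are known zero, so NumToTest = 16 matches below and
// the value is reported as 16 bits, Unsigned.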
1636 KnownBits Known = HVC.getKnownBits(V, In);
1637 Signedness Sign = Signed;
1638 unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
1639 if (isPowerOf2_32(Bits))
1640 NumToTest = Bits;
1641 else if (Bits > 1 && isPowerOf2_32(Bits - 1))
1642 NumToTest = Bits - 1;
1643
1644 if (NumToTest != 0 && Known.Zero.ashr(NumToTest).isAllOnes()) {
1645 Sign = Unsigned;
1646 Bits = NumToTest;
1647 }
1648
1649 // If the top bit of the nearest power-of-2 is zero, this value is
1650 // positive. It could be treated as either signed or unsigned.
1651 if (unsigned Pow2 = PowerOf2Ceil(Bits); Pow2 != Bits) {
1652 if (Known.Zero.ashr(Pow2 - 1).isAllOnes())
1653 Sign = Positive;
1654 }
1655 return {Bits, Sign};
1656}
1657
1658auto HvxIdioms::canonSgn(SValue X, SValue Y) const
1659 -> std::pair<SValue, SValue> {
1660 // Canonicalize the signedness of X and Y, so that the result is one of:
1661 // S, S
1662 // U/P, S
1663 // U/P, U/P
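// For example, (Signed, Unsigned) becomes (Unsigned, Signed), while
// (Signed, Signed) and (Unsigned, Positive) are left unchanged.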
1664 if (X.Sgn == Signed && Y.Sgn != Signed)
1665 std::swap(X, Y);
1666 return {X, Y};
1667}
1668
1669// Match
1670// (X * Y) [>> N], or
1671// ((X * Y) + (1 << M)) >> N
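// For example, a rounding Q15 multiply (x * y + (1 << 14)) >> 15 is matched
// with Frac = 15 and RoundAt = 14.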
1672auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
1673 using namespace PatternMatch;
1674 auto *Ty = In.getType();
1675
1676 if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
1677 return std::nullopt;
1678
1679 unsigned Width = cast<IntegerType>(Ty->getScalarType())->getBitWidth();
1680
1681 FxpOp Op;
1682 Value *Exp = &In;
1683
1684 // Fixed-point multiplication is always shifted right (except when the
1685 // fraction is 0 bits).
1686 auto m_Shr = [](auto &&V, auto &&S) {
1687 return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
1688 };
1689
1690 uint64_t Qn = 0;
1691 if (Value *T; match(Exp, m_Shr(m_Value(T), m_ConstantInt(Qn)))) {
1692 Op.Frac = Qn;
1693 Exp = T;
1694 } else {
1695 Op.Frac = 0;
1696 }
1697
1698 if (Op.Frac > Width)
1699 return std::nullopt;
1700
1701 // Check if there is rounding added.
1702 uint64_t CV;
1703 if (Value *T;
1704 Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_ConstantInt(CV)))) {
1705 if (CV != 0 && !isPowerOf2_64(CV))
1706 return std::nullopt;
1707 if (CV != 0)
1708 Op.RoundAt = Log2_64(CV);
1709 Exp = T;
1710 }
1711
1712 // Check if the rest is a multiplication.
1713 if (match(Exp, m_Mul(m_Value(Op.X.Val), m_Value(Op.Y.Val)))) {
1714 Op.Opcode = Instruction::Mul;
1715 // FIXME: The information below is recomputed.
1716 Op.X.Sgn = getNumSignificantBits(Op.X.Val, &In).second;
1717 Op.Y.Sgn = getNumSignificantBits(Op.Y.Val, &In).second;
1718 Op.ResTy = cast<VectorType>(Ty);
1719 return Op;
1720 }
1721
1722 return std::nullopt;
1723}
1724
1725auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
1726 -> Value * {
1727 assert(Op.X.Val->getType() == Op.Y.Val->getType());
1728
1729 auto *VecTy = dyn_cast<VectorType>(Op.X.Val->getType());
1730 if (VecTy == nullptr)
1731 return nullptr;
1732 auto *ElemTy = cast<IntegerType>(VecTy->getElementType());
1733 unsigned ElemWidth = ElemTy->getBitWidth();
1734
1735 // TODO: This can be relaxed after legalization is done pre-isel.
1736 if ((HVC.length(VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
1737 return nullptr;
1738
1739 // There are no special intrinsics that should be used for multiplying
1740 // signed 8-bit values, so just skip them. Normal codegen should handle
1741 // this just fine.
1742 if (ElemWidth <= 8)
1743 return nullptr;
1744 // Similarly, if this is just a multiplication that can be handled without
1745 // intervention, then leave it alone.
1746 if (ElemWidth <= 32 && Op.Frac == 0)
1747 return nullptr;
1748
1749 auto [BitsX, SignX] = getNumSignificantBits(Op.X.Val, &In);
1750 auto [BitsY, SignY] = getNumSignificantBits(Op.Y.Val, &In);
1751
1752 // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).
1753
1754 Value *X = Op.X.Val, *Y = Op.Y.Val;
1755 IRBuilder Builder(In.getParent(), In.getIterator(),
1756 InstSimplifyFolder(HVC.DL));
1757
1758 auto roundUpWidth = [](unsigned Width) -> unsigned {
1759 if (Width <= 32 && !isPowerOf2_32(Width)) {
1760 // If the element width is not a power of 2, round it up
1761 // to the next one. Do this for widths not exceeding 32.
1762 return PowerOf2Ceil(Width);
1763 }
1764 if (Width > 32 && Width % 32 != 0) {
1765 // For wider elements, round it up to the multiple of 32.
1766 return alignTo(Width, 32u);
1767 }
1768 return Width;
1769 };
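// For example, roundUpWidth(17) == 32, roundUpWidth(24) == 32 and
// roundUpWidth(33) == 64, while 16 and 64 are returned unchanged.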
1770
1771 BitsX = roundUpWidth(BitsX);
1772 BitsY = roundUpWidth(BitsY);
1773
1774 // For elementwise multiplication vectors must have the same lengths, so
1775 // resize the elements of both inputs to the same width, the max of the
1776 // calculated significant bits.
1777 unsigned Width = std::max(BitsX, BitsY);
1778
1779 auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);
1780 if (Width < ElemWidth) {
1781 X = Builder.CreateTrunc(X, ResizeTy, "trn");
1782 Y = Builder.CreateTrunc(Y, ResizeTy, "trn");
1783 } else if (Width > ElemWidth) {
1784 X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy, "sxt")
1785 : Builder.CreateZExt(X, ResizeTy, "zxt");
1786 Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy, "sxt")
1787 : Builder.CreateZExt(Y, ResizeTy, "zxt");
1788 }
1789
1790 assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1791
1792 unsigned VecLen = HVC.length(ResizeTy);
1793 unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(Width, 32u);
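// For example, with 32-bit elements on a 128-byte HVX target, ChopLen is
// (8 * 128) / 32 = 32, so a <64 x i32> input is processed in two chunks.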
1794
1795 SmallVector<Value *> Results;
1796 FxpOp ChopOp = Op;
1797 ChopOp.ResTy = VectorType::get(Op.ResTy->getElementType(), ChopLen, false);
1798
1799 for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1800 ChopOp.X.Val = HVC.subvector(Builder, X, V * ChopLen, ChopLen);
1801 ChopOp.Y.Val = HVC.subvector(Builder, Y, V * ChopLen, ChopLen);
1802 Results.push_back(processFxpMulChopped(Builder, In, ChopOp));
1803 if (Results.back() == nullptr)
1804 break;
1805 }
1806
1807 if (Results.empty() || Results.back() == nullptr)
1808 return nullptr;
1809
1810 Value *Cat = HVC.concat(Builder, Results);
1811 Value *Ext = SignX == Signed || SignY == Signed
1812 ? Builder.CreateSExt(Cat, VecTy, "sxt")
1813 : Builder.CreateZExt(Cat, VecTy, "zxt");
1814 return Ext;
1815}
1816
1817inline bool HvxIdioms::matchScatter(Instruction &In) const {
1818 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1819 if (!II)
1820 return false;
1821 return (II->getIntrinsicID() == Intrinsic::masked_scatter);
1822}
1823
1824inline bool HvxIdioms::matchGather(Instruction &In) const {
1825 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1826 if (!II)
1827 return false;
1828 return (II->getIntrinsicID() == Intrinsic::masked_gather);
1829}
1830
1831Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
1832
1833// Binary instructions we want to handle as users of gather/scatter.
1834inline bool isArithmetic(unsigned Opc) {
1835 switch (Opc) {
1836 case Instruction::Add:
1837 case Instruction::Sub:
1838 case Instruction::Mul:
1839 case Instruction::And:
1840 case Instruction::Or:
1841 case Instruction::Xor:
1842 case Instruction::AShr:
1843 case Instruction::LShr:
1844 case Instruction::Shl:
1845 case Instruction::UDiv:
1846 return true;
1847 }
1848 return false;
1849}
1850
1851// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
1852inline Value *getPointer(Value *Ptr) {
1853 assert(Ptr && "Unable to extract pointer");
1854 if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr))
1855 return Ptr;
1856 if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr))
1857 return getLoadStorePointerOperand(Ptr);
1858 if (auto *II = dyn_cast<IntrinsicInst>(Ptr)) {
1859 if (II->getIntrinsicID() == Intrinsic::masked_store)
1860 return II->getOperand(1);
1861 }
1862 return nullptr;
1863}
1864
1865 Instruction *selectDestination(Instruction *In,
1866 HvxIdioms::DstQualifier &Qual) {
1867 Instruction *Destination = nullptr;
1868 if (!In)
1869 return Destination;
1870 if (isa<StoreInst>(In)) {
1871 Destination = In;
1872 Qual = HvxIdioms::LdSt;
1873 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
1874 if (II->getIntrinsicID() == Intrinsic::masked_gather) {
1875 Destination = In;
1876 Qual = HvxIdioms::LLVM_Gather;
1877 } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
1878 Destination = In;
1879 Qual = HvxIdioms::LLVM_Scatter;
1880 } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
1881 Destination = In;
1882 Qual = HvxIdioms::LdSt;
1883 } else if (II->getIntrinsicID() ==
1884 Intrinsic::hexagon_V6_vgather_vscattermh) {
1885 Destination = In;
1886 Qual = HvxIdioms::HEX_Gather_Scatter;
1887 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
1888 Destination = In;
1889 Qual = HvxIdioms::HEX_Scatter;
1890 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
1891 Destination = In;
1892 Qual = HvxIdioms::HEX_Gather;
1893 }
1894 } else if (isa<ZExtInst>(In)) {
1895 return locateDestination(In, Qual);
1896 } else if (isa<CastInst>(In)) {
1897 return locateDestination(In, Qual);
1898 } else if (isa<CallInst>(In)) {
1899 Destination = In;
1900 Qual = HvxIdioms::Call;
1901 } else if (isa<GetElementPtrInst>(In)) {
1902 return locateDestination(In, Qual);
1903 } else if (isArithmetic(In->getOpcode())) {
1904 Destination = In;
1905 Qual = HvxIdioms::Arithmetic;
1906 } else {
1907 LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
1908 }
1909 return Destination;
1910}
1911
1912// This method attempts to find destination (user) for a given intrinsic.
1913 // Given that these are produced only by Ripple, the number of options is
1914 // limited. The simplest case is an explicit store, which is in fact redundant
1915 // (since the HVX gather creates its own store during packetization).
1916 // Nevertheless we need to figure out the address where we are storing. Other
1917 // cases are more complicated, but still few.
1918Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
1919 Instruction *Destination = nullptr;
1920 if (!In)
1921 return Destination;
1922 // Get all possible destinations
1923 SmallVector<Instruction *> Users;
1924 // Iterate over the uses of the instruction
1925 for (auto &U : In->uses()) {
1926 if (auto *UI = dyn_cast<Instruction>(U.getUser())) {
1927 Destination = selectDestination(UI, Qual);
1928 if (Destination)
1929 Users.push_back(Destination);
1930 }
1931 }
1932 // Now see which of the users (if any) is a memory destination.
1933 for (auto *I : Users)
1934 if (getPointer(I))
1935 return I;
1936 return Destination;
1937}
1938
1939// The two intrinsics we handle here have GEP in a different position.
1941 assert(In && "Bad instruction");
1942 IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(In);
1943 assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
1944 IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
1945 "Not a gather Intrinsic");
1946 GetElementPtrInst *GEPIndex = nullptr;
1947 if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
1948 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0));
1949 else
1950 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1));
1951 return GEPIndex;
1952}
1953
1954 // Given the intrinsic, find its GEP argument and extract the base address it
1955 // uses. The method relies on the way Ripple typically forms the GEP for
1956 // scatter/gather.
1959 if (!GEPIndex) {
1960 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
1961 return nullptr;
1962 }
1963 Value *BaseAddress = GEPIndex->getPointerOperand();
1964 auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress);
1965 if (IndexLoad)
1966 return IndexLoad;
1967
1968 auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress);
1969 if (IndexZEx) {
1970 IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0));
1971 if (IndexLoad)
1972 return IndexLoad;
1973 IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0));
1974 if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
1976 }
1977 auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress);
1978 if (BaseShuffle) {
1979 IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0));
1980 if (IndexLoad)
1981 return IndexLoad;
1982 auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0));
1983 if (IE) {
1984 auto *Src = IE->getOperand(1);
1985 IndexLoad = dyn_cast<LoadInst>(Src);
1986 if (IndexLoad)
1987 return IndexLoad;
1988 auto *Alloca = dyn_cast<AllocaInst>(Src);
1989 if (Alloca)
1990 return Alloca;
1991 if (isa<Argument>(Src)) {
1992 return Src;
1993 }
1994 if (isa<GlobalValue>(Src)) {
1995 return Src;
1996 }
1997 }
1998 }
1999 LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n");
2000 return nullptr;
2001}
2002
2003 Type *getIndexType(Value *In) {
2004 if (!In)
2005 return nullptr;
2006
2007 if (isa<LoadInst>(In) || isa<StoreInst>(In))
2008 return getLoadStoreType(In);
2009
2010 if (auto *II = dyn_cast<IntrinsicInst>(In)) {
2011 if (II->getIntrinsicID() == Intrinsic::masked_load)
2012 return II->getType();
2013 if (II->getIntrinsicID() == Intrinsic::masked_store)
2014 return II->getOperand(0)->getType();
2015 }
2016 return In->getType();
2017}
2018
2019 Value *locateIndexesFromGEP(Value *In) {
2020 if (!In)
2021 return nullptr;
2022 if (isa<LoadInst>(In))
2023 return In;
2024 if (auto *II = dyn_cast<IntrinsicInst>(In)) {
2025 if (II->getIntrinsicID() == Intrinsic::masked_load)
2026 return In;
2027 if (II->getIntrinsicID() == Intrinsic::masked_gather)
2028 return In;
2029 }
2030 if (auto *IndexZEx = dyn_cast<ZExtInst>(In))
2031 return locateIndexesFromGEP(IndexZEx->getOperand(0));
2032 if (auto *IndexSEx = dyn_cast<SExtInst>(In))
2033 return locateIndexesFromGEP(IndexSEx->getOperand(0));
2034 if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))
2035 return locateIndexesFromGEP(BaseShuffle->getOperand(0));
2036 if (auto *IE = dyn_cast<InsertElementInst>(In))
2037 return locateIndexesFromGEP(IE->getOperand(1));
2038 if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In))
2039 return cstDataVector;
2040 if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))
2041 return GEPIndex->getOperand(0);
2042 return nullptr;
2043}
2044
2045 // Given the intrinsic, find its GEP argument and extract the offsets from
2046 // the base address it uses.
2049 if (!GEPIndex) {
2050 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2051 return nullptr;
2052 }
2053 Value *Indexes = GEPIndex->getOperand(1);
2054 if (auto *IndexLoad = locateIndexesFromGEP(Indexes))
2055 return IndexLoad;
2056
2057 LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n");
2058 return nullptr;
2059}
2060
2061 // Because of the awkward definition of many Hexagon intrinsics, we often have
2062 // to reinterpret HVX-native <64 x i16> as <32 x i32>, which in practice is a
2063 // NOP for all use cases; this only exists to make the IR builder happy.
2064inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
2065 IRBuilderBase &Builder,
2066 LLVMContext &Ctx, Value *I) {
2067 assert(I && "Unable to reinterpret cast");
2068 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2069 std::vector<unsigned> shuffleMask;
2070 for (unsigned i = 0; i < 64; ++i)
2071 shuffleMask.push_back(i);
2072 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2073 Value *CastShuffle =
2074 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2075 return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32");
2076}
2077
2078// Recast <128 x i8> as <32 x i32>
2079inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
2080 IRBuilderBase &Builder,
2081 LLVMContext &Ctx, Value *I) {
2082 assert(I && "Unable to reinterpret cast");
2083 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2084 std::vector<unsigned> shuffleMask;
2085 for (unsigned i = 0; i < 128; ++i)
2086 shuffleMask.push_back(i);
2087 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2088 Value *CastShuffle =
2089 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2090 return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32");
2091}
2092
2093// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern
2094inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
2095 IRBuilderBase &Builder, LLVMContext &Ctx,
2096 unsigned int pattern) {
2097 std::vector<unsigned int> byteMask;
2098 for (unsigned i = 0; i < 32; ++i)
2099 byteMask.push_back(pattern);
2100
2101 return Builder.CreateIntrinsic(
2102 HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt),
2103 {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)},
2104 nullptr);
2105}
2106
2107Value *HvxIdioms::processVScatter(Instruction &In) const {
2108 auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());
2109 assert(InpTy && "Cannot handle non-vector type for llvm.scatter/gather");
2110 unsigned InpSize = HVC.getSizeOf(InpTy);
2111 auto *F = In.getFunction();
2112 LLVMContext &Ctx = F->getContext();
2113 auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType());
2114 assert(ElemTy && "llvm.scatter needs integer type argument");
2115 unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy);
2116 LLVM_DEBUG({
2117 unsigned Elements = HVC.length(InpTy);
2118 dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";
2119 dbgs() << " Input type(" << *InpTy << ") elements(" << Elements
2120 << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("
2121 << ElemWidth << ")\n";
2122 });
2123
2124 IRBuilder Builder(In.getParent(), In.getIterator(),
2125 InstSimplifyFolder(HVC.DL));
2126
2127 auto *ValueToScatter = In.getOperand(0);
2128 LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n");
2129
2130 if (HVC.HST.getVectorLength() != InpSize) {
2131 LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
2132 << ") for vscatter\n");
2133 return nullptr;
2134 }
2135
2136 // Base address of indexes.
2137 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2138 if (!IndexLoad)
2139 return nullptr;
2140 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2141
2142 // Address of destination. Must be in VTCM.
2143 auto *Ptr = getPointer(IndexLoad);
2144 if (!Ptr)
2145 return nullptr;
2146 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2147 // Indexes/offsets
2148 auto *Indexes = locateIndexesFromIntrinsic(&In);
2149 if (!Indexes)
2150 return nullptr;
2151 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2152 Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx),
2153 "cst_ptr_to_i32");
2154 LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n");
2155 // Adjust Indexes
2156 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2157 Value *CastIndex = nullptr;
2158 if (cstDataVector) {
2159 // Our indexes are represented as a constant. We need it in a reg.
2160 AllocaInst *IndexesAlloca =
2161 Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false));
2162 [[maybe_unused]] auto *StoreIndexes =
2163 Builder.CreateStore(cstDataVector, IndexesAlloca);
2164 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2165 CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(),
2166 IndexesAlloca, "reload_index");
2167 } else {
2168 if (ElemWidth == 2)
2169 CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2170 else
2171 CastIndex = Indexes;
2172 }
2173 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2174
2175 if (ElemWidth == 1) {
2176 // v128i8: There is no native instruction for this.
2177 // Do this as two Hi/Lo scatters with masking.
2178 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2179 // Extend indexes. We assume that indexes are in 128i8 format - need to
2180 // expand them to Hi/Lo 64i16
2181 Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32");
2182 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2183 auto *UnpackedIndexes = Builder.CreateIntrinsic(
2184 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr);
2185 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n");
2186
2187 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2188 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2189 [[maybe_unused]] Value *IndexHi =
2190 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2191 [[maybe_unused]] Value *IndexLo =
2192 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2193 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2194 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2195 // Now unpack values to scatter
2196 Value *CastSrc =
2197 getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter);
2198 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2199 auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
2200 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr);
2201 LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter
2202 << ")\n");
2203
2204 [[maybe_unused]] Value *UVSHi =
2205 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter);
2206 [[maybe_unused]] Value *UVSLo =
2207 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter);
2208 LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n");
2209 LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n");
2210
2211 // Create the mask for individual bytes
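// With the 0x00ff00ff pattern every other byte lane of each word is enabled,
// so the halfword scatters below only store the low byte of each unpacked
// element.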
2212 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2213 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2214 [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(
2215 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2216 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2217 IndexHi, UVSHi},
2218 nullptr);
2219 LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n");
2220 return Builder.CreateIntrinsic(
2221 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2222 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2223 IndexLo, UVSLo},
2224 nullptr);
2225 } else if (ElemWidth == 2) {
2226 Value *CastSrc =
2227 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter);
2228 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2229 return Builder.CreateIntrinsic(
2230 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B,
2231 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2232 CastSrc},
2233 nullptr);
2234 } else if (ElemWidth == 4) {
2235 return Builder.CreateIntrinsic(
2236 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B,
2237 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2238 ValueToScatter},
2239 nullptr);
2240 } else {
2241 LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
2242 return nullptr;
2243 }
2244}
2245
2246Value *HvxIdioms::processVGather(Instruction &In) const {
2247 [[maybe_unused]] auto *InpTy =
2248 dyn_cast<VectorType>(In.getOperand(0)->getType());
2249 assert(InpTy && "Cannot handle non-vector type for llvm.gather");
2250 [[maybe_unused]] auto *ElemTy =
2251 dyn_cast<PointerType>(InpTy->getElementType());
2252 assert(ElemTy && "llvm.gather needs vector of ptr argument");
2253 auto *F = In.getFunction();
2254 LLVMContext &Ctx = F->getContext();
2255 LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
2256 << *In.getParent() << "\n");
2257 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2258 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2259 << ") type(" << *ElemTy << ") Access alignment("
2260 << *In.getOperand(1) << ") AddressSpace("
2261 << ElemTy->getAddressSpace() << ")\n");
2262
2263 // TODO: Handle masking of elements.
2264 assert(isa<VectorType>(In.getOperand(2)->getType()) &&
2265 "llvm.gather needs vector for mask");
2266 IRBuilder Builder(In.getParent(), In.getIterator(),
2267 InstSimplifyFolder(HVC.DL));
2268
2269 // See who is using the result. The difference between the LLVM and HVX vgather
2270 // intrinsics makes it impossible to handle all cases without temp storage, and
2271 // alloca in VTCM is not yet supported, so for now we just bail out for those cases.
2272 HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
2273 Instruction *Dst = locateDestination(&In, Qual);
2274 if (!Dst) {
2275 LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n");
2276 return nullptr;
2277 }
2278 LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual
2279 << ")\n");
2280
2281 // Address of destination. Must be in VTCM.
2282 auto *Ptr = getPointer(Dst);
2283 if (!Ptr) {
2284 LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
2285 return nullptr;
2286 }
2287
2288 // Result type. Assume it is a vector type.
2289 auto *DstType = cast<VectorType>(getIndexType(Dst));
2290 assert(DstType && "Cannot handle non-vector dst type for llvm.gather");
2291
2292 // Base address for sources to be loaded
2293 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2294 if (!IndexLoad)
2295 return nullptr;
2296 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2297
2298 // Gather indexes/offsets
2299 auto *Indexes = locateIndexesFromIntrinsic(&In);
2300 if (!Indexes)
2301 return nullptr;
2302 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2303
2304 Instruction *Gather = nullptr;
2305 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2306 if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
2307 // We fully assume the address space is in VTCM. We also assume that all
2308 // pointers in Operand(0) have the same base(!).
2309 // This is the most basic case of all the above.
2310 unsigned OutputSize = HVC.getSizeOf(DstType);
2311 auto *DstElemTy = cast<IntegerType>(DstType->getElementType());
2312 unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy);
2313 LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType()
2314 << " Address space ("
2315 << Ptr->getType()->getPointerAddressSpace() << ")\n"
2316 << " Result type : " << *DstType
2317 << "\n Size in bytes : " << OutputSize
2318 << " element type(" << *DstElemTy
2319 << ")\n ElemWidth : " << ElemWidth << " bytes\n");
2320
2321 auto *IndexType = cast<VectorType>(getIndexType(Indexes));
2322 assert(IndexType && "Cannot handle non vector index type for llvm.gather");
2323 unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType());
2324 LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n");
2325
2326 // Intrinsic takes i32 instead of pointer so cast.
2327 Value *CastedPtr = Builder.CreateBitOrPointerCast(
2328 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2329 // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
2330 // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty]
2331 // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty]
2332 // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty]
2333 // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
2334 // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty]
2335 // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty]
2336 if (HVC.HST.getVectorLength() == OutputSize) {
2337 if (ElemWidth == 1) {
2338 // v128i8 There is no native instruction for this.
2339 // Do this as two Hi/Lo gathers with masking.
2340 // Unpack indexes. We assume that indexes are in 128i8 format - need to
2341 // expand them to Hi/Lo 64i16
2342 Value *CastIndexes =
2343 Builder.CreateBitCast(Indexes, NT, "cast_to_32i32");
2344 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2345 auto *UnpackedIndexes =
2346 Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true),
2347 V6_vunpack, CastIndexes, nullptr);
2348 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes
2349 << ")\n");
2350
2351 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2352 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2353 [[maybe_unused]] Value *IndexHi =
2354 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2355 [[maybe_unused]] Value *IndexLo =
2356 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2357 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2358 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2359 // Create the mask for individual bytes
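// With the 0x00ff00ff pattern every other byte lane of each word is enabled,
// so only the low byte of each gathered halfword is written to the temporary
// result.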
2360 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2361 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2362 // We use our destination allocation as a temp storage
2363 // This is unlikely to work properly for masked gather.
2364 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq);
2365 [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(
2366 Type::getVoidTy(Ctx), V6_vgather,
2367 {Ptr, QByteMask, CastedPtr,
2368 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
2369 nullptr);
2370 LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n");
2371 // Rematerialize the result
2372 [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(
2373 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi");
2374 LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n");
2375 // Same for the low part. Here we use Gather to return a non-null result
2376 // from this function and continue to iterate. We also delete the Dst
2377 // store below.
2378 Gather = Builder.CreateIntrinsic(
2379 Type::getVoidTy(Ctx), V6_vgather,
2380 {Ptr, QByteMask, CastedPtr,
2381 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
2382 nullptr);
2383 LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n");
2384 Value *LoadedResultLo = Builder.CreateLoad(
2385 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo");
2386 LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n");
2387 // Now we have properly sized bytes in every other position
2388 // B b A a c a A b B c f F g G h H is presented as
2389 // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
2390 // Use vpack to gather them
2391 auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb);
2392 [[maybe_unused]] auto Res = Builder.CreateIntrinsic(
2393 NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr);
2394 LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n");
2395 [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr);
2396 LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n");
2397 } else if (ElemWidth == 2) {
2398 // v32i16
2399 if (IndexWidth == 2) {
2400 // Reinterpret 64i16 as 32i32. Only needed for a syntactic IR match.
2401 Value *CastIndex =
2402 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2403 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2404 // shift all i16 left by 1 to match short addressing mode instead of
2405 // byte.
2406 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2407 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2408 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2410 << " Shifted half index: " << *AdjustedIndex << ")\n");
2411
2412 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh);
2413 // The 3rd argument is the size of the region to gather from. Probably
2414 // want to set it to max VTCM size.
2415 Gather = Builder.CreateIntrinsic(
2416 Type::getVoidTy(Ctx), V6_vgather,
2417 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2418 AdjustedIndex},
2419 nullptr);
2420 LLVM_DEBUG({
2421 for (auto &U : Dst->uses())
2422 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2423 dbgs() << " dst used by: " << *UI << "\n";
2424 for (auto &U : In.uses())
2425 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2426 dbgs() << " In used by : " << *UI << "\n";
2427 });
2428 // Create temp load from result in case the result is used by any
2429 // other instruction.
2430 Value *LoadedResult = Builder.CreateLoad(
2431 HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result");
2432 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2433 In.replaceAllUsesWith(LoadedResult);
2434 } else {
2435 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2436 return nullptr;
2437 }
2438 } else if (ElemWidth == 4) {
2439 if (IndexWidth == 4) {
2440 // v32i32
2441 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2442 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2443 Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)});
2445 << " Shifted word index: " << *AdjustedIndex << ")\n");
2446 Gather = Builder.CreateIntrinsic(
2447 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B,
2448 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2449 AdjustedIndex},
2450 nullptr);
2451 } else {
2452 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2453 return nullptr;
2454 }
2455 } else {
2456 LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n");
2457 return nullptr;
2458 }
2459 } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
2460 // This is half of the reg width, duplicate low in high
2461 LLVM_DEBUG(dbgs() << " Unhandled half of register size\n");
2462 return nullptr;
2463 } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
2464 LLVM_DEBUG(dbgs() << " Unhandle twice the register size\n");
2465 return nullptr;
2466 }
2467 // Erase the original intrinsic and the store that consumes it.
2468 // HVX will create a pseudo for gather that is expanded to gather + store
2469 // during packetization.
2470 Dst->eraseFromParent();
2471 } else if (Qual == HvxIdioms::LLVM_Scatter) {
2472 // Gather feeds directly into scatter.
2473 auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType());
2474 assert(DstInpTy && "Cannot handle non-vector type for llvm.scatter");
2475 [[maybe_unused]] unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
2476 [[maybe_unused]] unsigned DstElements = HVC.length(DstInpTy);
2477 [[maybe_unused]] auto *DstElemTy =
2478 cast<PointerType>(DstInpTy->getElementType());
2479 assert(DstElemTy && "llvm.scatter needs vector of ptr argument");
2480 LLVM_DEBUG(dbgs() << " Gather feeds into scatter\n Values to scatter : "
2481 << *Dst->getOperand(0) << "\n");
2482 LLVM_DEBUG(dbgs() << " Dst type(" << *DstInpTy << ") elements("
2483 << DstElements << ") VecLen(" << DstInpSize << ") type("
2484 << *DstElemTy << ") Access alignment("
2485 << *Dst->getOperand(2) << ")\n");
2486 // Address of source
2487 auto *Src = getPointer(IndexLoad);
2488 if (!Src)
2489 return nullptr;
2490 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2491
2492 if (!isa<PointerType>(Src->getType())) {
2493 LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n");
2494 return nullptr;
2495 }
2496
2497 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2498 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2499 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2500
2501 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2502 if (!DstLoad) {
2503 LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n");
2504 return nullptr;
2505 }
2506 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2507
2508 Value *Ptr = getPointer(DstLoad);
2509 if (!Ptr)
2510 return nullptr;
2511 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2512 Value *CastIndex =
2513 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad);
2514 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2515 // Shift all i16 left by 1 to match short addressing mode instead of
2516 // byte.
2517 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2518 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2519 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2520 LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n");
2521
2522 return Builder.CreateIntrinsic(
2523 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2524 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2525 AdjustedIndex},
2526 nullptr);
2527 } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
2528 // Gather feeds into previously inserted pseudo intrinsic.
2529 // These could not be in the same packet, so we need to generate another
2530 // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo
2531 // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,
2532 // ModRegs:$Mu, HvxVR:$Vv)
2533 if (isa<AllocaInst>(IndexLoad)) {
2534 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2535 if (cstDataVector) {
2536 // Our indexes are represented as a constant. We need them in a register.
2537 // This most likely will not work properly since alloca gives us a DDR
2538 // stack location. This will be fixed once we teach the compiler about VTCM.
2539 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2540 [[maybe_unused]] auto *StoreIndexes =
2541 Builder.CreateStore(cstDataVector, IndexesAlloca);
2542 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2543 Value *LoadedIndex = Builder.CreateLoad(
2544 IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
2545 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2546 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n");
2547
2548 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2549 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2550 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2551
2552 Gather = Builder.CreateIntrinsic(
2553 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2554 {ResultAlloca, CastedSrc,
2555 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2556 nullptr);
2557 Value *LoadedResult = Builder.CreateLoad(
2558 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2559 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2560 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2561 In.replaceAllUsesWith(LoadedResult);
2562 }
2563 } else {
2564 // Address of source
2565 auto *Src = getPointer(IndexLoad);
2566 if (!Src)
2567 return nullptr;
2568 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2569
2570 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2571 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2572 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2573
2574 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2575 if (!DstLoad)
2576 return nullptr;
2577 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2578 auto *Ptr = getPointer(DstLoad);
2579 if (!Ptr)
2580 return nullptr;
2581 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2582
2583 Gather = Builder.CreateIntrinsic(
2584 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh,
2585 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2586 Indexes},
2587 nullptr);
2588 }
2589 return Gather;
2590 } else if (Qual == HvxIdioms::HEX_Scatter) {
2591 // This is the case when the result of a gather is used as an argument to
2592 // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it
2593 // ourselves. We have to create an alloca, store to it, and replace all uses
2594 // with that.
2595 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2596 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2597 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2598 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2599 Value *CastIndex =
2600 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2601 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2602
2603 Gather = Builder.CreateIntrinsic(
2604 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2605 {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2606 CastIndex},
2607 nullptr);
2608 Value *LoadedResult = Builder.CreateLoad(
2609 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2610 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2611 In.replaceAllUsesWith(LoadedResult);
2612 } else if (Qual == HvxIdioms::HEX_Gather) {
2613 // Gather feeds into another gather that has already been replaced with
2614 // hexagon_V6_vgathermh_128B.
2615 if (isa<AllocaInst>(IndexLoad)) {
2616 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2617 if (cstDataVector) {
2618 // Our indexes are represented as a constant. We need it in a reg.
2619 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2620
2621 [[maybe_unused]] auto *StoreIndexes =
2622 Builder.CreateStore(cstDataVector, IndexesAlloca);
2623 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2624 Value *LoadedIndex = Builder.CreateLoad(
2625 IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
2626 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2627 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca
2628 << "\n AddressSpace: "
2629 << ResultAlloca->getAddressSpace() << "\n";);
2630
2631 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2632 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2633 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2634
2635 Gather = Builder.CreateIntrinsic(
2636 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2637 {ResultAlloca, CastedSrc,
2638 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2639 nullptr);
2640 Value *LoadedResult = Builder.CreateLoad(
2641 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2642 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2643 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2644 In.replaceAllUsesWith(LoadedResult);
2645 }
2646 }
2647 } else if (Qual == HvxIdioms::LLVM_Gather) {
2648 // Gather feeds into another gather
2649 errs() << " Unimplemented vgather-to-vgather sequence\n";
2650 return nullptr;
2651 } else
2652 llvm_unreachable("Unhandled Qual enum");
2653
2654 return Gather;
2655}
2656
2657auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
2658 const FxpOp &Op) const -> Value * {
2659 assert(Op.X.Val->getType() == Op.Y.Val->getType());
2660 auto *InpTy = cast<VectorType>(Op.X.Val->getType());
2661 unsigned Width = InpTy->getScalarSizeInBits();
2662 bool Rounding = Op.RoundAt.has_value();
2663
2664 if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
2665 // The fixed-point intrinsics do signed multiplication.
2666 if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
2667 Value *QMul = nullptr;
2668 if (Width == 16) {
2669 QMul = createMulQ15(Builder, Op.X, Op.Y, Rounding);
2670 } else if (Width == 32) {
2671 QMul = createMulQ31(Builder, Op.X, Op.Y, Rounding);
2672 }
2673 if (QMul != nullptr)
2674 return QMul;
2675 }
2676 }
2677
2678 assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n
2679 assert(Width < 32 || Width % 32 == 0); // Width > 32 => Width is 32*k
2680
2681 // If Width < 32, then it should really be 16.
2682 if (Width < 32) {
2683 if (Width < 16)
2684 return nullptr;
2685 // Getting here with Op.Frac == 0 isn't wrong, but suboptimal: here we
2686 // generate a full-precision product, which is unnecessary if there is
2687 // no shift.
2688 assert(Width == 16);
2689 assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
2690 if (Op.Frac == 16) {
2691 // Multiply high
2692 if (Value *MulH = createMulH16(Builder, Op.X, Op.Y))
2693 return MulH;
2694 }
2695 // Do full-precision multiply and shift.
2696 Value *Prod32 = createMul16(Builder, Op.X, Op.Y);
2697 if (Rounding) {
2698 Value *RoundVal = ConstantInt::get(Prod32->getType(), 1 << *Op.RoundAt);
2699 Prod32 = Builder.CreateAdd(Prod32, RoundVal, "add");
2700 }
2701
2702 Value *ShiftAmt = ConstantInt::get(Prod32->getType(), Op.Frac);
2703 Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
2704 ? Builder.CreateAShr(Prod32, ShiftAmt, "asr")
2705 : Builder.CreateLShr(Prod32, ShiftAmt, "lsr");
2706 return Builder.CreateTrunc(Shifted, InpTy, "trn");
2707 }
2708
2709 // Width >= 32
2710
2711 // Break up the arguments Op.X and Op.Y into vectors of smaller widths
2712 // in preparation of doing the multiplication by 32-bit parts.
2713 auto WordX = HVC.splitVectorElements(Builder, Op.X.Val, /*ToWidth=*/32);
2714 auto WordY = HVC.splitVectorElements(Builder, Op.Y.Val, /*ToWidth=*/32);
2715 auto WordP = createMulLong(Builder, WordX, Op.X.Sgn, WordY, Op.Y.Sgn);
2716
2717 auto *HvxWordTy = cast<VectorType>(WordP.front()->getType());
2718
2719 // Add the optional rounding to the proper word.
2720 if (Op.RoundAt.has_value()) {
2721 Value *Zero = Constant::getNullValue(WordX[0]->getType());
2722 SmallVector<Value *> RoundV(WordP.size(), Zero);
2723 RoundV[*Op.RoundAt / 32] =
2724 ConstantInt::get(HvxWordTy, 1 << (*Op.RoundAt % 32));
2725 WordP = createAddLong(Builder, WordP, RoundV);
2726 }
2727
2728 // createRightShiftLong?
2729
2730 // Shift all products right by Op.Frac.
2731 unsigned SkipWords = Op.Frac / 32;
2732 Constant *ShiftAmt = ConstantInt::get(HvxWordTy, Op.Frac % 32);
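// For example, for a 64-bit product kept in two words and Frac = 8:
// SkipWords = 0 and ShiftAmt = 8, so the low word becomes fshr(Hi, Lo, 8)
// and the high word becomes Hi >> 8 (arithmetic).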
2733
2734 for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
2735 int Src = Dst + SkipWords;
2736 Value *Lo = WordP[Src];
2737 if (Src + 1 < End) {
2738 Value *Hi = WordP[Src + 1];
2739 WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,
2740 {Hi, Lo, ShiftAmt},
2741 /*FMFSource*/ nullptr, "int");
2742 } else {
2743 // The shift of the most significant word.
2744 WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt, "asr");
2745 }
2746 }
2747 if (SkipWords != 0)
2748 WordP.resize(WordP.size() - SkipWords);
2749
2750 return HVC.joinVectorElements(Builder, WordP, Op.ResTy);
2751}
2752
2753auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
2754 bool Rounding) const -> Value * {
2755 assert(X.Val->getType() == Y.Val->getType());
2756 assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
2757 assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
2758
2759 // There is no non-rounding intrinsic for i16.
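// V6_vmpyhvsrs is (roughly) a rounding Q15 multiply, i.e. per lane
// (x * y + 0x4000) >> 15 with saturation, which is why it is only used
// when rounding is requested.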
2760 if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
2761 return nullptr;
2762
2763 auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhvsrs);
2764 return HVC.createHvxIntrinsic(Builder, V6_vmpyhvsrs, X.Val->getType(),
2765 {X.Val, Y.Val});
2766}
2767
2768auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
2769 bool Rounding) const -> Value * {
2770 Type *InpTy = X.Val->getType();
2771 assert(InpTy == Y.Val->getType());
2772 assert(InpTy->getScalarType() == HVC.getIntTy(32));
2773 assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
2774
2775 if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
2776 return nullptr;
2777
2778 auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyewuh);
2779 auto V6_vmpyo_acc = Rounding
2780 ? HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_rnd_sacc)
2781 : HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_sacc);
2782 Value *V1 =
2783 HVC.createHvxIntrinsic(Builder, V6_vmpyewuh, InpTy, {X.Val, Y.Val});
2784 return HVC.createHvxIntrinsic(Builder, V6_vmpyo_acc, InpTy,
2785 {V1, X.Val, Y.Val});
2786}
2787
2788auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
2789 Value *CarryIn) const
2790 -> std::pair<Value *, Value *> {
2791 assert(X->getType() == Y->getType());
2792 auto VecTy = cast<VectorType>(X->getType());
2793 if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
2794 SmallVector<Value *> Args = {X, Y};
2795 Intrinsic::ID AddCarry;
2796 if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
2797 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarryo);
2798 } else {
2799 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry);
2800 if (CarryIn == nullptr)
2801 CarryIn = Constant::getNullValue(HVC.getBoolTy(HVC.length(VecTy)));
2802 Args.push_back(CarryIn);
2803 }
2804 Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,
2805 /*RetTy=*/nullptr, Args);
2806 Value *Result = Builder.CreateExtractValue(Ret, {0}, "ext");
2807 Value *CarryOut = Builder.CreateExtractValue(Ret, {1}, "ext");
2808 return {Result, CarryOut};
2809 }
2810
2811 // In other cases, do a regular add, and unsigned compare-less-than.
2812 // The carry-out can originate in two places: adding the carry-in or adding
2813 // the two input values.
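// Since X + Y + CarryIn can carry out at most once per lane, OR-ing the two
// compare results below is sufficient.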
2814 Value *Result1 = X; // Result1 = X + CarryIn
2815 if (CarryIn != nullptr) {
2816 unsigned Width = VecTy->getScalarSizeInBits();
2817 uint32_t Mask = 1;
2818 if (Width < 32) {
2819 for (unsigned i = 0, e = 32 / Width; i != e; ++i)
2820 Mask = (Mask << Width) | 1;
2821 }
2822 auto V6_vandqrt = HVC.HST.getIntrinsicId(Hexagon::V6_vandqrt);
2823 Value *ValueIn =
2824 HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,
2825 {CarryIn, HVC.getConstInt(Mask)});
2826 Result1 = Builder.CreateAdd(X, ValueIn, "add");
2827 }
2828
2829 Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X, "cmp");
2830 Value *Result2 = Builder.CreateAdd(Result1, Y, "add");
2831 Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y, "cmp");
2832 return {Result2, Builder.CreateOr(CarryOut1, CarryOut2, "orb")};
2833}
2834
2835auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
2836 -> Value * {
2837 Intrinsic::ID V6_vmpyh = 0;
2838 std::tie(X, Y) = canonSgn(X, Y);
2839
2840 if (X.Sgn == Signed) {
2841 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhv);
2842 } else if (Y.Sgn == Signed) {
2843 // In vmpyhus the second operand is unsigned
2844 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhus);
2845 } else {
2846 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhv);
2847 }
2848
2849 // i16*i16 -> i32 / interleaved
2850 Value *P =
2851 HVC.createHvxIntrinsic(Builder, V6_vmpyh, HvxP32Ty, {Y.Val, X.Val});
2852 // Deinterleave
2853 return HVC.vshuff(Builder, HVC.sublo(Builder, P), HVC.subhi(Builder, P));
2854}
2855
2856auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
2857 -> Value * {
2858 Type *HvxI16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/false);
2859
2860 if (HVC.HST.useHVXV69Ops()) {
2861 if (X.Sgn != Signed && Y.Sgn != Signed) {
2862 auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhvs);
2863 return HVC.createHvxIntrinsic(Builder, V6_vmpyuhvs, HvxI16Ty,
2864 {X.Val, Y.Val});
2865 }
2866 }
2867
2868 Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);
2869 Value *Pair16 =
2870 Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty, "cst");
2871 unsigned Len = HVC.length(HvxP16Ty) / 2;
2872
2873 SmallVector<int, 128> PickOdd(Len);
2874 for (int i = 0; i != static_cast<int>(Len); ++i)
2875 PickOdd[i] = 2 * i + 1;
2876
2877 return Builder.CreateShuffleVector(
2878 HVC.sublo(Builder, Pair16), HVC.subhi(Builder, Pair16), PickOdd, "shf");
2879}
2880
2881auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
2882 -> std::pair<Value *, Value *> {
2883 assert(X.Val->getType() == Y.Val->getType());
2884 assert(X.Val->getType() == HvxI32Ty);
2885
2886 Intrinsic::ID V6_vmpy_parts;
2887 std::tie(X, Y) = canonSgn(X, Y);
2888
2889 if (X.Sgn == Signed) {
2890 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
2891 } else if (Y.Sgn == Signed) {
2892 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
2893 } else {
2894 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
2895 }
2896
2897 Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,
2898 {X.Val, Y.Val}, {HvxI32Ty});
2899 Value *Hi = Builder.CreateExtractValue(Parts, {0}, "ext");
2900 Value *Lo = Builder.CreateExtractValue(Parts, {1}, "ext");
2901 return {Lo, Hi};
2902}
2903
2904auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
2905 ArrayRef<Value *> WordY) const
2906 -> SmallVector<Value *> {
2907 assert(WordX.size() == WordY.size());
2908 unsigned Idx = 0, Length = WordX.size();
2909 SmallVector<Value *> Sum(Length);
2910
2911 while (Idx != Length) {
2912 if (HVC.isZero(WordX[Idx]))
2913 Sum[Idx] = WordY[Idx];
2914 else if (HVC.isZero(WordY[Idx]))
2915 Sum[Idx] = WordX[Idx];
2916 else
2917 break;
2918 ++Idx;
2919 }
2920
2921 Value *Carry = nullptr;
2922 for (; Idx != Length; ++Idx) {
2923 std::tie(Sum[Idx], Carry) =
2924 createAddCarry(Builder, WordX[Idx], WordY[Idx], Carry);
2925 }
2926
2927 // This drops the final carry beyond the highest word.
2928 return Sum;
2929}
2930
2931auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
2932 Signedness SgnX, ArrayRef<Value *> WordY,
2933 Signedness SgnY) const -> SmallVector<Value *> {
2934 SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
2935
2936 // WordX[i] * WordY[j] produces words i+j and i+j+1 of the results,
2937 // that is halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
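// For example, with two words per input, Products[1] collects
// {hi(x0*y0), lo(x0*y1), lo(x1*y0)} before the carry-propagating reduction
// below adds them together.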
2938 for (int i = 0, e = WordX.size(); i != e; ++i) {
2939 for (int j = 0, f = WordY.size(); j != f; ++j) {
2940 // Check the 4 halves that this multiplication can generate.
2941 Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
2942 Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
2943 auto [Lo, Hi] = createMul32(Builder, {WordX[i], SX}, {WordY[j], SY});
2944 Products[i + j + 0].push_back(Lo);
2945 Products[i + j + 1].push_back(Hi);
2946 }
2947 }
2948
2949 Value *Zero = Constant::getNullValue(WordX[0]->getType());
2950
2951 auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
2952 if (Vector.empty())
2953 return Zero;
2954 auto Last = Vector.back();
2955 Vector.pop_back();
2956 return Last;
2957 };
2958
2959 for (int i = 0, e = Products.size(); i != e; ++i) {
2960 while (Products[i].size() > 1) {
2961 Value *Carry = nullptr; // no carry-in
2962 for (int j = i; j != e; ++j) {
2963 auto &ProdJ = Products[j];
2964 auto [Sum, CarryOut] = createAddCarry(Builder, pop_back_or_zero(ProdJ),
2965 pop_back_or_zero(ProdJ), Carry);
2966 ProdJ.insert(ProdJ.begin(), Sum);
2967 Carry = CarryOut;
2968 }
2969 }
2970 }
2971
2972 SmallVector<Value *> WordP;
2973 for (auto &P : Products) {
2974 assert(P.size() == 1 && "Should have been added together");
2975 WordP.push_back(P.front());
2976 }
2977
2978 return WordP;
2979}
2980
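// Walk each block bottom-up and rewrite recognized idioms: fixed-point
// multiplications, and gather/scatter intrinsic patterns.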
2981auto HvxIdioms::run() -> bool {
2982 bool Changed = false;
2983
2984 for (BasicBlock &B : HVC.F) {
2985 for (auto It = B.rbegin(); It != B.rend(); ++It) {
2986 if (auto Fxm = matchFxpMul(*It)) {
2987 Value *New = processFxpMul(*It, *Fxm);
2988 // Always report "changed" for now.
2989 Changed = true;
2990 if (!New)
2991 continue;
2992 bool StartOver = !isa<Instruction>(New);
2993 It->replaceAllUsesWith(New);
2994 RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
2995 It = StartOver ? B.rbegin()
2996 : cast<Instruction>(New)->getReverseIterator();
2997 Changed = true;
2998 } else if (matchGather(*It)) {
2999 Value *New = processVGather(*It);
3000 if (!New)
3001 continue;
3002 LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n");
3003 // We replace the original intrinsic with a new pseudo call.
3004 It->eraseFromParent();
3005 It = cast<Instruction>(New)->getReverseIterator();
3007 Changed = true;
3008 } else if (matchScatter(*It)) {
3009 Value *New = processVScatter(*It);
3010 if (!New)
3011 continue;
3012 LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n");
3013 // We replace the original intrinsic with a new pseudo call.
3014 It->eraseFromParent();
3015 It = cast<Instruction>(New)->getReverseIterator();
3017 Changed = true;
3018 }
3019 }
3020 }
3021
3022 return Changed;
3023}
3024
3025// --- End HvxIdioms
3026
3027auto HexagonVectorCombine::run() -> bool {
3028 if (DumpModule)
3029 dbgs() << "Module before HexagonVectorCombine\n" << *F.getParent();
3030
3031 bool Changed = false;
3032 if (HST.useHVXOps()) {
3033 if (VAEnabled)
3034 Changed |= AlignVectors(*this).run();
3035 if (VIEnabled)
3036 Changed |= HvxIdioms(*this).run();
3037 }
3038
3039 if (DumpModule) {
3040 dbgs() << "Module " << (Changed ? "(modified)" : "(unchanged)")
3041 << " after HexagonVectorCombine\n"
3042 << *F.getParent();
3043 }
3044 return Changed;
3045}
3046
3047auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
3048 return IntegerType::get(F.getContext(), Width);
3049}
3050
3051auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
3052 assert(ElemCount >= 0);
3053 IntegerType *ByteTy = Type::getInt8Ty(F.getContext());
3054 if (ElemCount == 0)
3055 return ByteTy;
3056 return VectorType::get(ByteTy, ElemCount, /*Scalable=*/false);
3057}
3058
3059auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
3060 assert(ElemCount >= 0);
3061 IntegerType *BoolTy = Type::getInt1Ty(F.getContext());
3062 if (ElemCount == 0)
3063 return BoolTy;
3064 return VectorType::get(BoolTy, ElemCount, /*Scalable=*/false);
3065}
3066
3067auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
3068 -> ConstantInt * {
3069 return ConstantInt::getSigned(getIntTy(Width), Val);
3070}
3071
3072auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
3073 if (auto *C = dyn_cast<Constant>(Val))
3074 return C->isZeroValue();
3075 return false;
3076}
3077
3078auto HexagonVectorCombine::getIntValue(const Value *Val) const
3079 -> std::optional<APInt> {
3080 if (auto *CI = dyn_cast<ConstantInt>(Val))
3081 return CI->getValue();
3082 return std::nullopt;
3083}
3084
3085auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
3086 return isa<UndefValue>(Val);
3087}
3088
3089auto HexagonVectorCombine::isTrue(const Value *Val) const -> bool {
3090 return Val == ConstantInt::getTrue(Val->getType());
3091}
3092
3093auto HexagonVectorCombine::isFalse(const Value *Val) const -> bool {
3094 return isZero(Val);
3095}
3096
3097auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
3098 -> VectorType * {
3099 EVT ETy = EVT::getEVT(ElemTy, false);
3100 assert(ETy.isSimple() && "Invalid HVX element type");
3101 // Do not allow boolean types here: they don't have a fixed length.
3102 assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
3103 "Invalid HVX element type");
3104 unsigned HwLen = HST.getVectorLength();
3105 unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
3106 return VectorType::get(ElemTy, Pair ? 2 * NumElems : NumElems,
3107 /*Scalable=*/false);
3108}
3109
3110auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
3111 -> int {
3112 return getSizeOf(Val->getType(), Kind);
3113}
3114
3115auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
3116 -> int {
3117 auto *NcTy = const_cast<Type *>(Ty);
3118 switch (Kind) {
3119 case Store:
3120 return DL.getTypeStoreSize(NcTy).getFixedValue();
3121 case Alloc:
3122 return DL.getTypeAllocSize(NcTy).getFixedValue();
3123 }
3124 llvm_unreachable("Unhandled SizeKind enum");
3125}
3126
3127auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
3128 // The actual type may be shorter than the HVX vector, so determine
3129 // the alignment based on subtarget info.
3130 if (HST.isTypeForHVX(Ty))
3131 return HST.getVectorLength();
3132 return DL.getABITypeAlign(Ty).value();
3133}
3134
3135auto HexagonVectorCombine::length(Value *Val) const -> size_t {
3136 return length(Val->getType());
3137}
3138
3139auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
3140 auto *VecTy = dyn_cast<VectorType>(Ty);
3141 assert(VecTy && "Must be a vector type");
3142 return VecTy->getElementCount().getFixedValue();
3143}
3144
3145auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
3146 if (auto *In = dyn_cast<Instruction>(V)) {
3147 SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
3148 return simplifyInstruction(In, Q);
3149 }
3150 return nullptr;
3151}
3152
3153// Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
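// E.g. insertb(Dst, Src, Start=2, Length=3, Where=5) produces
// Dst[0..4] ++ Src[2..4] ++ Dst[8..].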
3154auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
3155 Value *Src, int Start, int Length,
3156 int Where) const -> Value * {
3157 assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
3158 int SrcLen = getSizeOf(Src);
3159 int DstLen = getSizeOf(Dst);
3160 assert(0 <= Start && Start + Length <= SrcLen);
3161 assert(0 <= Where && Where + Length <= DstLen);
3162
3163 int P2Len = PowerOf2Ceil(SrcLen | DstLen);
3164 auto *Poison = PoisonValue::get(getByteTy());
3165 Value *P2Src = vresize(Builder, Src, P2Len, Poison);
3166 Value *P2Dst = vresize(Builder, Dst, P2Len, Poison);
3167
3168 SmallVector<int, 256> SMask(P2Len);
3169 for (int i = 0; i != P2Len; ++i) {
3170 // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
3171 // Otherwise, pick Dst[i];
3172 SMask[i] =
3173 (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
3174 }
3175
3176 Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask, "shf");
3177 return vresize(Builder, P2Insert, DstLen, Poison);
3178}
3179
3180auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
3181 Value *Hi, Value *Amt) const -> Value * {
3182 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3183 if (isZero(Amt))
3184 return Hi;
3185 int VecLen = getSizeOf(Hi);
3186 if (auto IntAmt = getIntValue(Amt))
3187 return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),
3188 VecLen);
3189
3190 if (HST.isTypeForHVX(Hi->getType())) {
3191 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3192 "Expecting an exact HVX type");
3193 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_vlalignb),
3194 Hi->getType(), {Hi, Lo, Amt});
3195 }
3196
3197 if (VecLen == 4) {
3198 Value *Pair = concat(Builder, {Lo, Hi});
3199 Value *Shift =
3200 Builder.CreateLShr(Builder.CreateShl(Pair, Amt, "shl"), 32, "lsr");
3201 Value *Trunc =
3202 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3203 return Builder.CreateBitCast(Trunc, Hi->getType(), "cst");
3204 }
3205 if (VecLen == 8) {
3206 Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt, "sub");
3207 return vralignb(Builder, Lo, Hi, Sub);
3208 }
3209 llvm_unreachable("Unexpected vector length");
3210}
3211
3212auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
3213 Value *Hi, Value *Amt) const -> Value * {
3214 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3215 if (isZero(Amt))
3216 return Lo;
3217 int VecLen = getSizeOf(Lo);
3218 if (auto IntAmt = getIntValue(Amt))
3219 return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);
3220
3221 if (HST.isTypeForHVX(Lo->getType())) {
3222 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3223 "Expecting an exact HVX type");
3224 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_valignb),
3225 Lo->getType(), {Hi, Lo, Amt});
3226 }
3227
3228 if (VecLen == 4) {
3229 Value *Pair = concat(Builder, {Lo, Hi});
3230 Value *Shift = Builder.CreateLShr(Pair, Amt, "lsr");
3231 Value *Trunc =
3232 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3233 return Builder.CreateBitCast(Trunc, Lo->getType(), "cst");
3234 }
3235 if (VecLen == 8) {
3236 Type *Int64Ty = Type::getInt64Ty(F.getContext());
3237 Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst");
3238 Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst");
3239 Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb,
3240 {Hi64, Lo64, Amt},
3241 /*FMFSource=*/nullptr, "cup");
3242 return Builder.CreateBitCast(Call, Lo->getType(), "cst");
3243 }
3244 llvm_unreachable("Unexpected vector length");
3245}
3246
3247// Concatenates a sequence of vectors of the same type.
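// The inputs are joined pairwise (padding the worklist with undef when its
// length is odd), so the concatenation forms a balanced tree of shuffles;
// a final shuffle trims the result back to the exact requested length.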
3248auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
3249 ArrayRef<Value *> Vecs) const -> Value * {
3250 assert(!Vecs.empty());
3251 SmallVector<int, 256> SMask;
3252 std::vector<Value *> Work[2];
3253 int ThisW = 0, OtherW = 1;
3254
3255 Work[ThisW].assign(Vecs.begin(), Vecs.end());
3256 while (Work[ThisW].size() > 1) {
3257 auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());
3258 SMask.resize(length(Ty) * 2);
3259 std::iota(SMask.begin(), SMask.end(), 0);
3260
3261 Work[OtherW].clear();
3262 if (Work[ThisW].size() % 2 != 0)
3263 Work[ThisW].push_back(UndefValue::get(Ty));
3264 for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
3265 Value *Joined = Builder.CreateShuffleVector(
3266 Work[ThisW][i], Work[ThisW][i + 1], SMask, "shf");
3267 Work[OtherW].push_back(Joined);
3268 }
3269 std::swap(ThisW, OtherW);
3270 }
3271
3272 // Since there may have been some undefs appended to make shuffle operands
3273 // have the same type, perform the last shuffle to only pick the original
3274 // elements.
3275 SMask.resize(Vecs.size() * length(Vecs.front()->getType()));
3276 std::iota(SMask.begin(), SMask.end(), 0);
3277 Value *Total = Work[ThisW].front();
3278 return Builder.CreateShuffleVector(Total, SMask, "shf");
3279}
3280
3281auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
3282 int NewSize, Value *Pad) const -> Value * {
3283 assert(isa<VectorType>(Val->getType()));
3284 auto *ValTy = cast<VectorType>(Val->getType());
3285 assert(ValTy->getElementType() == Pad->getType());
3286
3287 int CurSize = length(ValTy);
3288 if (CurSize == NewSize)
3289 return Val;
3290 // Truncate?
3291 if (CurSize > NewSize)
3292 return getElementRange(Builder, Val, /*Ignored*/ Val, 0, NewSize);
3293 // Extend.
3294 SmallVector<int, 128> SMask(NewSize);
3295 std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
3296 std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
3297 Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad, "spt");
3298 return Builder.CreateShuffleVector(Val, PadVec, SMask, "shf");
3299}
3300
3301auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
3302 Type *FromTy, Type *ToTy) const -> Value * {
3303 // Mask is a vector <N x i1>, where each element corresponds to an
3304 // element of FromTy. Remap it so that each element will correspond
3305 // to an element of ToTy.
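 // E.g. a <4 x i1> mask for a <4 x i32> value, rescaled for <16 x i8>,
 // becomes a <16 x i1> mask in which each source bit is repeated 4 times.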
3306 assert(isa<VectorType>(Mask->getType()));
3307
3308 Type *FromSTy = FromTy->getScalarType();
3309 Type *ToSTy = ToTy->getScalarType();
3310 if (FromSTy == ToSTy)
3311 return Mask;
3312
3313 int FromSize = getSizeOf(FromSTy);
3314 int ToSize = getSizeOf(ToSTy);
3315 assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
3316
3317 auto *MaskTy = cast<VectorType>(Mask->getType());
3318 int FromCount = length(MaskTy);
3319 int ToCount = (FromCount * FromSize) / ToSize;
3320 assert((FromCount * FromSize) % ToSize == 0);
3321
3322 auto *FromITy = getIntTy(FromSize * 8);
3323 auto *ToITy = getIntTy(ToSize * 8);
3324
3325 // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
3326 // -> trunc to <M x i1>.
3327 Value *Ext = Builder.CreateSExt(
3328 Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false), "sxt");
3329 Value *Cast = Builder.CreateBitCast(
3330 Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false), "cst");
3331 return Builder.CreateTrunc(
3332 Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false), "trn");
3333}
3334
3335// Bitcast to bytes, and return least significant bits.
3336auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
3337 -> Value * {
3338 Type *ScalarTy = Val->getType()->getScalarType();
3339 if (ScalarTy == getBoolTy())
3340 return Val;
3341
3342 Value *Bytes = vbytes(Builder, Val);
3343 if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
3344 return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)), "trn");
3345 // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
3346 // <1 x i1>.
3347 return Builder.CreateTrunc(Bytes, getBoolTy(), "trn");
3348}
3349
3350// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
3351auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
3352 -> Value * {
3353 Type *ScalarTy = Val->getType()->getScalarType();
3354 if (ScalarTy == getByteTy())
3355 return Val;
3356
3357 if (ScalarTy != getBoolTy())
3358 return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)), "cst");
3359 // For bool, return a sext from i1 to i8.
3360 if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
3361 return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy), "sxt");
3362 return Builder.CreateSExt(Val, getByteTy(), "sxt");
3363}
3364
3365auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
3366 unsigned Start, unsigned Length) const
3367 -> Value * {
3368 assert(Start + Length <= length(Val));
3369 return getElementRange(Builder, Val, /*Ignored*/ Val, Start, Length);
3370}
3371
3372auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
3373 -> Value * {
3374 size_t Len = length(Val);
3375 assert(Len % 2 == 0 && "Length should be even");
3376 return subvector(Builder, Val, 0, Len / 2);
3377}
3378
3379auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
3380 -> Value * {
3381 size_t Len = length(Val);
3382 assert(Len % 2 == 0 && "Length should be even");
3383 return subvector(Builder, Val, Len / 2, Len / 2);
3384}
3385
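// "Deal" two vectors: gather the even-indexed elements of the concatenation
// into the low half of the result and the odd-indexed elements into the high
// half, e.g. vdeal([a0 a1 a2 a3], [b0 b1 b2 b3]) = [a0 a2 b0 b2 a1 a3 b1 b3].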
3386auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
3387 Value *Val1) const -> Value * {
3388 assert(Val0->getType() == Val1->getType());
3389 int Len = length(Val0);
3390 SmallVector<int, 128> Mask(2 * Len);
3391
3392 for (int i = 0; i != Len; ++i) {
3393 Mask[i] = 2 * i; // Even
3394 Mask[i + Len] = 2 * i + 1; // Odd
3395 }
3396 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3397}
3398
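// "Shuffle" (interleave) two vectors element by element, e.g.
// vshuff([a0 a1 a2 a3], [b0 b1 b2 b3]) = [a0 b0 a1 b1 a2 b2 a3 b3].
// This is the inverse permutation of vdeal above.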
3399auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
3400 Value *Val1) const -> Value * { //
3401 assert(Val0->getType() == Val1->getType());
3402 int Len = length(Val0);
3403 SmallVector<int, 128> Mask(2 * Len);
3404
3405 for (int i = 0; i != Len; ++i) {
3406 Mask[2 * i + 0] = i; // Val0
3407 Mask[2 * i + 1] = i + Len; // Val1
3408 }
3409 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3410}
3411
3412auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
3413 Intrinsic::ID IntID, Type *RetTy,
3414 ArrayRef<Value *> Args,
3415 ArrayRef<Type *> ArgTys,
3416 ArrayRef<Value *> MDSources) const
3417 -> Value * {
3418 auto getCast = [&](IRBuilderBase &Builder, Value *Val,
3419 Type *DestTy) -> Value * {
3420 Type *SrcTy = Val->getType();
3421 if (SrcTy == DestTy)
3422 return Val;
3423
3424 // Non-HVX type. It should be a scalar, and it should already have
3425 // a valid type.
3426 assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
3427
3428 Type *BoolTy = Type::getInt1Ty(F.getContext());
3429 if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)
3430 return Builder.CreateBitCast(Val, DestTy, "cst");
3431
3432 // Predicate HVX vector.
3433 unsigned HwLen = HST.getVectorLength();
3434 Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
3435 : Intrinsic::hexagon_V6_pred_typecast_128B;
3436 return Builder.CreateIntrinsic(TC, {DestTy, Val->getType()}, {Val},
3437 /*FMFSource=*/nullptr, "cup");
3438 };
3439
3440 Function *IntrFn =
3441 Intrinsic::getOrInsertDeclaration(F.getParent(), IntID, ArgTys);
3442 FunctionType *IntrTy = IntrFn->getFunctionType();
3443
3444 SmallVector<Value *, 4> IntrArgs;
3445 for (int i = 0, e = Args.size(); i != e; ++i) {
3446 Value *A = Args[i];
3447 Type *T = IntrTy->getParamType(i);
3448 if (A->getType() != T) {
3449 IntrArgs.push_back(getCast(Builder, A, T));
3450 } else {
3451 IntrArgs.push_back(A);
3452 }
3453 }
3454 StringRef MaybeName = !IntrTy->getReturnType()->isVoidTy() ? "cup" : "";
3455 CallInst *Call = Builder.CreateCall(IntrFn, IntrArgs, MaybeName);
3456
3457 MemoryEffects ME = Call->getAttributes().getMemoryEffects();
3458 if (!ME.doesNotAccessMemory() && !ME.onlyAccessesInaccessibleMem())
3459 propagateMetadata(Call, MDSources);
3460
3461 Type *CallTy = Call->getType();
3462 if (RetTy == nullptr || CallTy == RetTy)
3463 return Call;
3464 // Scalar types should have RetTy matching the call return type.
3465 assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
3466 return getCast(Builder, Call, RetTy);
3467}
3468
3469auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
3470 Value *Vec,
3471 unsigned ToWidth) const
3472 -> SmallVector<Value *> {
3473 // Break a vector of wide elements into a series of vectors with narrow
3474 // elements:
3475 // (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
3476 // -->
3477 // (a0, a1, a2, ...) // lowest "ToWidth" bits
3478 // (b0, b1, b2, ...) // the next lowest...
3479 // (c0, c1, c2, ...) // ...
3480 // ...
3481 //
3482 // The number of elements in each resulting vector is the same as
3483 // in the original vector.
3484
3485 auto *VecTy = cast<VectorType>(Vec->getType());
3486 assert(VecTy->getElementType()->isIntegerTy());
3487 unsigned FromWidth = VecTy->getScalarSizeInBits();
3488 assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
3489 assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
3490 unsigned NumResults = FromWidth / ToWidth;
3491
3492 SmallVector<Value *> Results(NumResults);
3493 Results[0] = Vec;
3494 unsigned Length = length(VecTy);
3495
3496 // Do it by splitting in half, since those operations correspond to deal
3497 // instructions.
3498 auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
3499 // Take V = Results[Begin] and split it into halves L and H.
3500 // Store Results[Begin] = L and Results[(Begin+End)/2] = H.
3501 // Recurse: split(Begin, Half) and split(Half, End).
3502 if (Begin + 1 == End)
3503 return;
3504
3505 Value *Val = Results[Begin];
3506 unsigned Width = Val->getType()->getScalarSizeInBits();
3507
3508 auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);
3509 Value *VVal = Builder.CreateBitCast(Val, VTy, "cst");
3510
3511 Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));
3512
3513 unsigned Half = (Begin + End) / 2;
3514 Results[Begin] = sublo(Builder, Res);
3515 Results[Half] = subhi(Builder, Res);
3516
3517 splitFunc(Begin, Half, splitFunc);
3518 splitFunc(Half, End, splitFunc);
3519 };
3520
3521 splitInHalf(0, NumResults, splitInHalf);
3522 return Results;
3523}
3524
3525auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
3526 ArrayRef<Value *> Values,
3527 VectorType *ToType) const
3528 -> Value * {
3529 assert(ToType->getElementType()->isIntegerTy());
3530
3531 // If the list of values does not have power-of-2 elements, append copies
3532 // of the sign bit to it, to make the size be 2^n.
3533 // The reason for this is that the values will be joined in pairs, because
3534 // otherwise the shuffles will result in convoluted code. With pairwise
3535 // joins, the shuffles will hopefully be folded into a perfect shuffle.
3536 // The output will need to be sign-extended to a type with element width
3537 // being a power-of-2 anyways.
3538 SmallVector<Value *> Inputs(Values);
3539
3540 unsigned ToWidth = ToType->getScalarSizeInBits();
3541 unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
3542 assert(Width <= ToWidth);
3543 assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
3544 unsigned Length = length(Inputs.front()->getType());
3545
3546 unsigned NeedInputs = ToWidth / Width;
3547 if (Inputs.size() != NeedInputs) {
3548 // Having too many inputs is ok: drop the high bits (usual wrap-around).
3549 // If there are too few, fill them with the sign bit.
3550 Value *Last = Inputs.back();
3551 Value *Sign = Builder.CreateAShr(
3552 Last, ConstantInt::get(Last->getType(), Width - 1), "asr");
3553 Inputs.resize(NeedInputs, Sign);
3554 }
3555
3556 while (Inputs.size() > 1) {
3557 Width *= 2;
3558 auto *VTy = VectorType::get(getIntTy(Width), Length, false);
3559 for (int i = 0, e = Inputs.size(); i < e; i += 2) {
3560 Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);
3561 Inputs[i / 2] = Builder.CreateBitCast(Res, VTy, "cst");
3562 }
3563 Inputs.resize(Inputs.size() / 2);
3564 }
3565
3566 assert(Inputs.front()->getType() == ToType);
3567 return Inputs.front();
3568}
3569
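// Compute the constant byte difference Ptr0 - Ptr1 if it can be proven.
// Try ScalarEvolution first; otherwise require both pointers to be
// single-index GEPs off the same base, and combine the known and unknown
// bits of the two indices separately.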
3570auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
3571 Value *Ptr1) const
3572 -> std::optional<int> {
3573 // Try SCEV first.
3574 const SCEV *Scev0 = SE.getSCEV(Ptr0);
3575 const SCEV *Scev1 = SE.getSCEV(Ptr1);
3576 const SCEV *ScevDiff = SE.getMinusSCEV(Scev0, Scev1);
3577 if (auto *Const = dyn_cast<SCEVConstant>(ScevDiff)) {
3578 APInt V = Const->getAPInt();
3579 if (V.isSignedIntN(8 * sizeof(int)))
3580 return static_cast<int>(V.getSExtValue());
3581 }
3582
3583 struct Builder : IRBuilder<> {
3584 Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
3585 ~Builder() {
3586 for (Instruction *I : llvm::reverse(ToErase))
3587 I->eraseFromParent();
3588 }
3589 SmallVector<Instruction *, 8> ToErase;
3590 };
3591
3592#define CallBuilder(B, F) \
3593 [&](auto &B_) { \
3594 Value *V = B_.F; \
3595 if (auto *I = dyn_cast<Instruction>(V)) \
3596 B_.ToErase.push_back(I); \
3597 return V; \
3598 }(B)
3599
3600 auto Simplify = [this](Value *V) {
3601 if (Value *S = simplify(V))
3602 return S;
3603 return V;
3604 };
3605
3606 auto StripBitCast = [](Value *V) {
3607 while (auto *C = dyn_cast<BitCastInst>(V))
3608 V = C->getOperand(0);
3609 return V;
3610 };
3611
3612 Ptr0 = StripBitCast(Ptr0);
3613 Ptr1 = StripBitCast(Ptr1);
3614 if (!isa<GetElementPtrInst>(Ptr0) || !isa<GetElementPtrInst>(Ptr1))
3615 return std::nullopt;
3616
3617 auto *Gep0 = cast<GetElementPtrInst>(Ptr0);
3618 auto *Gep1 = cast<GetElementPtrInst>(Ptr1);
3619 if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
3620 return std::nullopt;
3621 if (Gep0->getSourceElementType() != Gep1->getSourceElementType())
3622 return std::nullopt;
3623
3624 Builder B(Gep0->getParent());
3625 int Scale = getSizeOf(Gep0->getSourceElementType(), Alloc);
3626
3627 // FIXME: for now only check GEPs with a single index.
3628 if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
3629 return std::nullopt;
3630
3631 Value *Idx0 = Gep0->getOperand(1);
3632 Value *Idx1 = Gep1->getOperand(1);
3633
3634 // First, try to simplify the subtraction directly.
3635 if (auto *Diff = dyn_cast<ConstantInt>(
3636 Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
3637 return Diff->getSExtValue() * Scale;
3638
3639 KnownBits Known0 = getKnownBits(Idx0, Gep0);
3640 KnownBits Known1 = getKnownBits(Idx1, Gep1);
3641 APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
3642 if (Unknown.isAllOnes())
3643 return std::nullopt;
3644
3645 Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
3646 Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
3647 Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
3648 Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
3649 int Diff0 = 0;
3650 if (auto *C = dyn_cast<ConstantInt>(SubU)) {
3651 Diff0 = C->getSExtValue();
3652 } else {
3653 return std::nullopt;
3654 }
3655
3656 Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);
3657 Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
3658 Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
3659 Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
3660 int Diff1 = 0;
3661 if (auto *C = dyn_cast<ConstantInt>(SubK)) {
3662 Diff1 = C->getSExtValue();
3663 } else {
3664 return std::nullopt;
3665 }
3666
3667 return (Diff0 + Diff1) * Scale;
3668
3669#undef CallBuilder
3670}
3671
3672auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
3673 const Instruction *CtxI) const
3674 -> unsigned {
3675 return ComputeMaxSignificantBits(V, DL, &AC, CtxI, &DT);
3676}
3677
3678auto HexagonVectorCombine::getKnownBits(const Value *V,
3679 const Instruction *CtxI) const
3680 -> KnownBits {
3681 return computeKnownBits(V, DL, &AC, CtxI, &DT);
3682}
3683
3684auto HexagonVectorCombine::isSafeToClone(const Instruction &In) const -> bool {
3685 if (In.mayHaveSideEffects() || In.isAtomic() || In.isVolatile() ||
3686 In.isFenceLike() || In.mayReadOrWriteMemory()) {
3687 return false;
3688 }
3689 if (isa<CallBase>(In) || isa<AllocaInst>(In))
3690 return false;
3691 return true;
3692}
3693
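// Check whether In can be moved to position To within its own basic block
// without changing observable behavior: no PHIs may be crossed, and any
// intervening instruction (other than those in IgnoreInsts) that may throw,
// may not return or synchronize, or may access memory aliasing In's location
// blocks the move.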
3694template <typename T>
3695auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
3696 BasicBlock::const_iterator To,
3697 const T &IgnoreInsts) const
3698 -> bool {
3699 auto getLocOrNone =
3700 [this](const Instruction &I) -> std::optional<MemoryLocation> {
3701 if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
3702 switch (II->getIntrinsicID()) {
3703 case Intrinsic::masked_load:
3704 return MemoryLocation::getForArgument(II, 0, TLI);
3705 case Intrinsic::masked_store:
3706 return MemoryLocation::getForArgument(II, 1, TLI);
3707 }
3708 }
3709 return MemoryLocation::getOrNone(&I);
3710 };
3711
3712 // The source and the destination must be in the same basic block.
3713 const BasicBlock &Block = *In.getParent();
3714 assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
3715 // No PHIs.
3716 if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))
3717 return false;
3718
3719 if (!In.mayReadOrWriteMemory())
3720 return true;
3721 bool MayWrite = In.mayWriteToMemory();
3722 auto MaybeLoc = getLocOrNone(In);
3723
3724 auto From = In.getIterator();
3725 if (From == To)
3726 return true;
3727 bool MoveUp = (To != Block.end() && To->comesBefore(&In));
3728 auto Range =
3729 MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
3730 for (auto It = Range.first; It != Range.second; ++It) {
3731 const Instruction &I = *It;
3732 if (llvm::is_contained(IgnoreInsts, &I))
3733 continue;
3734 // assume intrinsic can be ignored
3735 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
3736 if (II->getIntrinsicID() == Intrinsic::assume)
3737 continue;
3738 }
3739 // Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
3740 if (I.mayThrow())
3741 return false;
3742 if (auto *CB = dyn_cast<CallBase>(&I)) {
3743 if (!CB->hasFnAttr(Attribute::WillReturn))
3744 return false;
3745 if (!CB->hasFnAttr(Attribute::NoSync))
3746 return false;
3747 }
3748 if (I.mayReadOrWriteMemory()) {
3749 auto MaybeLocI = getLocOrNone(I);
3750 if (MayWrite || I.mayWriteToMemory()) {
3751 if (!MaybeLoc || !MaybeLocI)
3752 return false;
3753 if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
3754 return false;
3755 }
3756 }
3757 }
3758 return true;
3759}
3760
3761auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
3762 if (auto *VecTy = dyn_cast<VectorType>(Ty))
3763 return VecTy->getElementType() == getByteTy();
3764 return false;
3765}
3766
3767auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
3768 Value *Hi, int Start,
3769 int Length) const -> Value * {
3770 assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
3771 SmallVector<int, 128> SMask(Length);
3772 std::iota(SMask.begin(), SMask.end(), Start);
3773 return Builder.CreateShuffleVector(Lo, Hi, SMask, "shf");
3774}
3775
3776// Pass management.
3777
3778namespace {
3779class HexagonVectorCombineLegacy : public FunctionPass {
3780public:
3781 static char ID;
3782
3783 HexagonVectorCombineLegacy() : FunctionPass(ID) {}
3784
3785 StringRef getPassName() const override { return "Hexagon Vector Combine"; }
3786
3787 void getAnalysisUsage(AnalysisUsage &AU) const override {
3788 AU.setPreservesCFG();
3789 AU.addRequired<AAResultsWrapperPass>();
3790 AU.addRequired<AssumptionCacheTracker>();
3791 AU.addRequired<DominatorTreeWrapperPass>();
3792 AU.addRequired<ScalarEvolutionWrapperPass>();
3793 AU.addRequired<TargetLibraryInfoWrapperPass>();
3794 AU.addRequired<TargetPassConfig>();
3795 FunctionPass::getAnalysisUsage(AU);
3796 }
3797
3798 bool runOnFunction(Function &F) override {
3799 if (skipFunction(F))
3800 return false;
3801 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
3802 AssumptionCache &AC =
3803 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3804 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3805 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
3806 TargetLibraryInfo &TLI =
3807 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
3808 auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
3809 HexagonVectorCombine HVC(F, AA, AC, DT, SE, TLI, TM);
3810 return HVC.run();
3811 }
3812};
3813} // namespace
3814
3815char HexagonVectorCombineLegacy::ID = 0;
3816
3817INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
3818 "Hexagon Vector Combine", false, false)
3819 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
3820 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
3821 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
3822 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
3823 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
3824 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
3825 INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
3826 "Hexagon Vector Combine", false, false)
3827
3828 FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
3829 return new HexagonVectorCombineLegacy();
3830}