1//===-- HexagonVectorCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// HexagonVectorCombine is a utility class implementing a variety of functions
9// that assist in vector-based optimizations.
10//
11// AlignVectors: replace unaligned vector loads and stores with aligned ones.
12// HvxIdioms: recognize various opportunities to generate HVX intrinsic code.
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/DenseMap.h"
18#include "llvm/ADT/MapVector.h"
19#include "llvm/ADT/STLExtras.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/Intrinsics.h"
36#include "llvm/IR/IntrinsicsHexagon.h"
37#include "llvm/IR/Metadata.h"
40#include "llvm/Pass.h"
47
48#include "Hexagon.h"
49#include "HexagonSubtarget.h"
51
52#include <algorithm>
53#include <deque>
54#include <map>
55#include <optional>
56#include <set>
57#include <utility>
58#include <vector>
59
60#define DEBUG_TYPE "hexagon-vc"
61
62// This constant represents the default HVX VTCM page size.
63// It is boot-time configurable, so we probably want an API to
64// read it, but for now assume 128KB.
65#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
66
67using namespace llvm;
68
69namespace {
70cl::opt<bool> DumpModule("hvc-dump-module", cl::Hidden);
71cl::opt<bool> VAEnabled("hvc-va", cl::Hidden, cl::init(true)); // Align
72cl::opt<bool> VIEnabled("hvc-vi", cl::Hidden, cl::init(true)); // Idioms
73cl::opt<bool> VADoFullStores("hvc-va-full-stores", cl::Hidden);
74
75cl::opt<unsigned> VAGroupCountLimit("hvc-va-group-count-limit", cl::Hidden,
76 cl::init(~0));
77cl::opt<unsigned> VAGroupSizeLimit("hvc-va-group-size-limit", cl::Hidden,
78 cl::init(~0));
79
80class HexagonVectorCombine {
81public:
82 HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
83 DominatorTree &DT_, ScalarEvolution &SE_,
84 TargetLibraryInfo &TLI_, const TargetMachine &TM_)
85 : F(F_), DL(F.getDataLayout()), AA(AA_), AC(AC_), DT(DT_),
86 SE(SE_), TLI(TLI_),
87 HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}
88
89 bool run();
90
91 // Common integer type.
92 IntegerType *getIntTy(unsigned Width = 32) const;
93 // Byte type: either scalar (when ElemCount = 0), or vector with given
94 // element count.
95 Type *getByteTy(int ElemCount = 0) const;
96 // Boolean type: either scalar (when ElemCount = 0), or vector with given
97 // element count.
98 Type *getBoolTy(int ElemCount = 0) const;
99 // Create a ConstantInt of type returned by getIntTy with the value Val.
100 ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
101 // Get the integer value of V, if it exists.
102 std::optional<APInt> getIntValue(const Value *Val) const;
103 // Is Val a constant 0, or a vector of 0s?
104 bool isZero(const Value *Val) const;
105 // Is Val an undef value?
106 bool isUndef(const Value *Val) const;
107 // Is Val a scalar (i1 true) or a vector of (i1 true)?
108 bool isTrue(const Value *Val) const;
109 // Is Val a scalar (i1 false) or a vector of (i1 false)?
110 bool isFalse(const Value *Val) const;
111
112 // Get HVX vector type with the given element type.
113 VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;
114
115 enum SizeKind {
116 Store, // Store size
117 Alloc, // Alloc size
118 };
119 int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
120 int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
121 int getTypeAlignment(Type *Ty) const;
122 size_t length(Value *Val) const;
123 size_t length(Type *Ty) const;
124
125 Value *simplify(Value *Val) const;
126
127 Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
128 int Length, int Where) const;
129 Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
130 Value *Amt) const;
131 Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
132 Value *Amt) const;
133 Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
134 Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
135 Value *Pad) const;
136 Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
137 Type *ToTy) const;
138 Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
139 Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
140 Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
141 unsigned Length) const;
142 Value *sublo(IRBuilderBase &Builder, Value *Val) const;
143 Value *subhi(IRBuilderBase &Builder, Value *Val) const;
144 Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
145 Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
146
147 Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
148 Type *RetTy, ArrayRef<Value *> Args,
149 ArrayRef<Type *> ArgTys = {},
150 ArrayRef<Value *> MDSources = {}) const;
151 SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
152 unsigned ToWidth) const;
153 Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
154 VectorType *ToType) const;
155
156 std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
157
158 unsigned getNumSignificantBits(const Value *V,
159 const Instruction *CtxI = nullptr) const;
160 KnownBits getKnownBits(const Value *V,
161 const Instruction *CtxI = nullptr) const;
162
163 bool isSafeToClone(const Instruction &In) const;
164
165 template <typename T = std::vector<Instruction *>>
166 bool isSafeToMoveBeforeInBB(const Instruction &In,
167 BasicBlock::const_iterator To,
168 const T &IgnoreInsts = {}) const;
169
170 // This function is only used for assertions at the moment.
171 [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
172
173 Function &F;
174 const DataLayout &DL;
175 AliasAnalysis &AA;
176 AssumptionCache &AC;
177 DominatorTree &DT;
178 ScalarEvolution &SE;
179 TargetLibraryInfo &TLI;
180 const HexagonSubtarget &HST;
181
182private:
183 Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
184 int Start, int Length) const;
185};
186
187class AlignVectors {
188 // This code tries to replace unaligned vector loads/stores with aligned
189 // ones.
190 // Consider unaligned load:
191 // %v = original_load %some_addr, align <bad>
192 // %user = %v
193 // It will generate
194 // = load ..., align <good>
195 // = load ..., align <good>
196 // = valign
197 // etc.
198 // %synthesize = combine/shuffle the loaded data so that it looks
199 // exactly like what "original_load" has loaded.
200 // %user = %synthesize
201 // Similarly for stores.
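 // For the load case, as a concrete sketch (illustrative values, not from
 // any particular input): an HVX 128B load such as
 //   %v = load <32 x i32>, ptr %p, align 4
 // may become two 128-byte-aligned loads covering %p and %p+128, combined
 // with a valign whose shift amount is derived from the low bits of %p,
 // so the combined value equals what %v loaded, and %v's users then use
 // that combined value.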
202public:
203 AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}
204
205 bool run();
206
207private:
208 using InstList = std::vector<Instruction *>;
209 using InstMap = DenseMap<Instruction *, Instruction *>;
210
211 struct AddrInfo {
212 AddrInfo(const AddrInfo &) = default;
213 AddrInfo &operator=(const AddrInfo &) = default;
214 AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
215 Align H)
216 : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
217 NeedAlign(HVC.getTypeAlignment(ValTy)) {}
218
219 // XXX: add Size member?
220 Instruction *Inst;
221 Value *Addr;
222 Type *ValTy;
223 Align HaveAlign;
224 Align NeedAlign;
225 int Offset = 0; // Offset (in bytes) from the first member of the
226 // containing AddrList.
227 };
228 using AddrList = std::vector<AddrInfo>;
229
230 struct InstrLess {
231 bool operator()(const Instruction *A, const Instruction *B) const {
232 return A->comesBefore(B);
233 }
234 };
235 using DepList = std::set<Instruction *, InstrLess>;
236
237 struct MoveGroup {
238 MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
239 : Base(B), Main{AI.Inst}, Clones{}, IsHvx(Hvx), IsLoad(Load) {}
240 MoveGroup() = default;
241 Instruction *Base; // Base instruction of the parent address group.
242 InstList Main; // Main group of instructions.
243 InstList Deps; // List of dependencies.
244 InstMap Clones; // Map from original Deps to cloned ones.
245 bool IsHvx; // Is this a group of HVX instructions?
246 bool IsLoad; // Is this a load group?
247 };
248 using MoveList = std::vector<MoveGroup>;
249
250 struct ByteSpan {
251 // A representation of "interesting" bytes within a given span of memory.
252 // These bytes are those that are loaded or stored, and they don't have
253 // to cover the entire span of memory.
254 //
255 // The representation works by picking a contiguous sequence of bytes
256 // from somewhere within a llvm::Value, and placing it at a given offset
257 // within the span.
258 //
259 // The sequence of bytes from llvm::Value is represented by Segment.
260 // Block is Segment, plus where it goes in the span.
261 //
262 // An important feature of ByteSpan is being able to make a "section",
263 // i.e. creating another ByteSpan corresponding to a range of offsets
264 // relative to the source span.
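 //
 // As an illustrative example (values invented for exposition): a Block
 // with Seg = {%v, 4, 8} and Pos = 16 says that bytes 4..11 of %v occupy
 // offsets 16..23 of the span, and section(16, 4) would produce a ByteSpan
 // whose only Block is {Seg = {%v, 4, 4}, Pos = 16}.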
265
266 struct Segment {
267 // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
268 Segment(Value *Val, int Begin, int Len)
269 : Val(Val), Start(Begin), Size(Len) {}
270 Segment(const Segment &Seg) = default;
271 Segment &operator=(const Segment &Seg) = default;
272 Value *Val; // Value representable as a sequence of bytes.
273 int Start; // First byte of the value that belongs to the segment.
274 int Size; // Number of bytes in the segment.
275 };
276
277 struct Block {
278 Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
279 Block(Value *Val, int Off, int Len, int Pos)
280 : Seg(Val, Off, Len), Pos(Pos) {}
281 Block(const Block &Blk) = default;
282 Block &operator=(const Block &Blk) = default;
283 Segment Seg; // Value segment.
284 int Pos; // Position (offset) of the block in the span.
285 };
286
287 int extent() const;
288 ByteSpan section(int Start, int Length) const;
289 ByteSpan &shift(int Offset);
290 SmallVector<Value *, 8> values() const;
291
292 int size() const { return Blocks.size(); }
293 Block &operator[](int i) { return Blocks[i]; }
294 const Block &operator[](int i) const { return Blocks[i]; }
295
296 std::vector<Block> Blocks;
297
298 using iterator = decltype(Blocks)::iterator;
299 iterator begin() { return Blocks.begin(); }
300 iterator end() { return Blocks.end(); }
301 using const_iterator = decltype(Blocks)::const_iterator;
302 const_iterator begin() const { return Blocks.begin(); }
303 const_iterator end() const { return Blocks.end(); }
304 };
305
306 std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
307 bool isHvx(const AddrInfo &AI) const;
308 // This function is only used for assertions at the moment.
309 [[maybe_unused]] bool isSectorTy(Type *Ty) const;
310
311 Value *getPayload(Value *Val) const;
312 Value *getMask(Value *Val) const;
313 Value *getPassThrough(Value *Val) const;
314
315 Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
316 int Adjust,
317 const InstMap &CloneMap = InstMap()) const;
318 Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
319 int Alignment,
320 const InstMap &CloneMap = InstMap()) const;
321
322 Value *createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
323 Value *Predicate, int Alignment, Value *Mask,
324 Value *PassThru, ArrayRef<Value *> MDSources = {}) const;
325 Value *createSimpleLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
326 int Alignment,
327 ArrayRef<Value *> MDSources = {}) const;
328
329 Value *createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
330 Value *Predicate, int Alignment, Value *Mask,
331 ArrayRef<Value *> MDSources = {}) const;
332 Value *createSimpleStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
333 int Alignment,
334 ArrayRef<Value *> MDSources = {}) const;
335
336 Value *createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
337 Value *Predicate, int Alignment,
338 ArrayRef<Value *> MDSources = {}) const;
339 Value *createPredicatedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
340 Value *Predicate, int Alignment,
341 ArrayRef<Value *> MDSources = {}) const;
342
343 DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
344 bool createAddressGroups();
345 MoveList createLoadGroups(const AddrList &Group) const;
346 MoveList createStoreGroups(const AddrList &Group) const;
347 bool moveTogether(MoveGroup &Move) const;
348 template <typename T>
349 InstMap cloneBefore(BasicBlock::iterator To, T &&Insts) const;
350
351 void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
352 int ScLen, Value *AlignVal, Value *AlignAddr) const;
353 void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
354 int ScLen, Value *AlignVal, Value *AlignAddr) const;
355 bool realignGroup(const MoveGroup &Move);
356 Value *makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
357 int Alignment) const;
358
359 using AddrGroupMap = MapVector<Instruction *, AddrList>;
360 AddrGroupMap AddrGroups;
361
362 friend raw_ostream &operator<<(raw_ostream &OS, const AddrList &L);
363 friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
364 friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
365 friend raw_ostream &operator<<(raw_ostream &OS, const MoveList &L);
366 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
367 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
368 friend raw_ostream &operator<<(raw_ostream &OS, const AddrGroupMap &AG);
376
377 const HexagonVectorCombine &HVC;
378};
379
380[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
381 const AlignVectors::AddrGroupMap &AG) {
382 OS << "Printing AddrGroups:"
383 << "\n";
384 for (auto &It : AG) {
385 OS << "\n\tInstruction: ";
386 It.first->dump();
387 OS << "\n\tAddrInfo: ";
388 for (auto &AI : It.second)
389 OS << AI << "\n";
390 }
391 return OS;
392}
393
394[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
395 const AlignVectors::AddrList &AL) {
396 OS << "\n *** Addr List: ***\n";
397 for (auto &AG : AL) {
398 OS << "\n *** Addr Group: ***\n";
399 OS << AG;
400 OS << "\n";
401 }
402 return OS;
403}
404
405[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
406 const AlignVectors::AddrInfo &AI) {
407 OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
408 OS << "Addr: " << *AI.Addr << '\n';
409 OS << "Type: " << *AI.ValTy << '\n';
410 OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
411 OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
412 OS << "Offset: " << AI.Offset;
413 return OS;
414}
415
416[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
417 const AlignVectors::MoveList &ML) {
418 OS << "\n *** Move List: ***\n";
419 for (auto &MG : ML) {
420 OS << "\n *** Move Group: ***\n";
421 OS << MG;
422 OS << "\n";
423 }
424 return OS;
425}
426
427[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
428 const AlignVectors::MoveGroup &MG) {
429 OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
430 OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
431 OS << "Main\n";
432 for (Instruction *I : MG.Main)
433 OS << " " << *I << '\n';
434 OS << "Deps\n";
435 for (Instruction *I : MG.Deps)
436 OS << " " << *I << '\n';
437 OS << "Clones\n";
438 for (auto [K, V] : MG.Clones) {
439 OS << " ";
440 K->printAsOperand(OS, false);
441 OS << "\t-> " << *V << '\n';
442 }
443 return OS;
444}
445
446[[maybe_unused]] raw_ostream &
447operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) {
448 OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
449 if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {
450 OS << "(self:" << B.Seg.Val << ')';
451 } else if (B.Seg.Val != nullptr) {
452 OS << *B.Seg.Val;
453 } else {
454 OS << "(null)";
455 }
456 return OS;
457}
458
459[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
460 const AlignVectors::ByteSpan &BS) {
461 OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
462 for (const AlignVectors::ByteSpan::Block &B : BS)
463 OS << B << '\n';
464 OS << ']';
465 return OS;
466}
467
468class HvxIdioms {
469public:
470 enum DstQualifier {
471 Undefined = 0,
472 Arithmetic,
473 LdSt,
474 LLVM_Gather,
475 LLVM_Scatter,
476 HEX_Gather_Scatter,
477 HEX_Gather,
478 HEX_Scatter,
479 Call
480 };
481
482 HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
483 auto *Int32Ty = HVC.getIntTy(32);
484 HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
485 HvxP32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/true);
486 }
487
488 bool run();
489
490private:
491 enum Signedness { Positive, Signed, Unsigned };
492
493 // Value + sign
494 // This is to keep track of whether the value should be treated as signed
495 // or unsigned, or is known to be positive.
496 struct SValue {
497 Value *Val;
498 Signedness Sgn;
499 };
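 // For example (illustrative): a value produced by zext would typically be
 // tracked as Unsigned, one produced by sext as Signed, and a value whose
 // sign bit is known to be clear can be tagged Positive, since it can then
 // be treated as either signed or unsigned.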
500
501 struct FxpOp {
502 unsigned Opcode;
503 unsigned Frac; // Number of fraction bits
504 SValue X, Y;
505 // If present, add 1 << RoundAt before shift:
506 std::optional<unsigned> RoundAt;
507 VectorType *ResTy;
508 };
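 // As a hypothetical example (not from any specific input): the Q15
 // pattern (x * y + (1 << 14)) >> 15 on i16 vector elements would be
 // described as Opcode = Instruction::Mul, Frac = 15, RoundAt = 14, with
 // ResTy being the i16 vector result type.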
509
510 auto getNumSignificantBits(Value *V, Instruction *In) const
511 -> std::pair<unsigned, Signedness>;
512 auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;
513
514 auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
515 auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;
516
517 auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
518 const FxpOp &Op) const -> Value *;
519 auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
520 bool Rounding) const -> Value *;
521 auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
522 bool Rounding) const -> Value *;
523 // Return {Result, Carry}, where Carry is a vector predicate.
524 auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
525 Value *CarryIn = nullptr) const
526 -> std::pair<Value *, Value *>;
527 auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
528 auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
529 -> Value *;
530 auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
531 -> std::pair<Value *, Value *>;
532 auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
533 ArrayRef<Value *> WordY) const -> SmallVector<Value *>;
534 auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
535 Signedness SgnX, ArrayRef<Value *> WordY,
536 Signedness SgnY) const -> SmallVector<Value *>;
537
538 bool matchMLoad(Instruction &In) const;
539 bool matchMStore(Instruction &In) const;
540 Value *processMLoad(Instruction &In) const;
541 Value *processMStore(Instruction &In) const;
542 std::optional<uint64_t> getAlignment(Instruction &In, Value *ptr) const;
543 std::optional<uint64_t>
544 getAlignmentImpl(Instruction &In, Value *ptr,
545 SmallPtrSet<Value *, 16> &Visited) const;
546 std::optional<uint64_t> getPHIBaseMinAlignment(Instruction &In,
547 PHINode *PN) const;
548
549 // Vector manipulations for Ripple
550 bool matchScatter(Instruction &In) const;
551 bool matchGather(Instruction &In) const;
552 Value *processVScatter(Instruction &In) const;
553 Value *processVGather(Instruction &In) const;
554
555 VectorType *HvxI32Ty;
556 VectorType *HvxP32Ty;
557 const HexagonVectorCombine &HVC;
558
559 friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
560};
561
562[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
563 const HvxIdioms::FxpOp &Op) {
564 static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
565 OS << Instruction::getOpcodeName(Op.Opcode) << '.' << Op.Frac;
566 if (Op.RoundAt.has_value()) {
567 if (Op.Frac != 0 && *Op.RoundAt == Op.Frac - 1) {
568 OS << ":rnd";
569 } else {
570 OS << " + 1<<" << *Op.RoundAt;
571 }
572 }
573 OS << "\n X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
574 << " Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
575 return OS;
576}
577
578} // namespace
579
580namespace {
581
582template <typename T> T *getIfUnordered(T *MaybeT) {
583 return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;
584}
585template <typename T> T *isCandidate(Instruction *In) {
586 return dyn_cast<T>(In);
587}
588template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {
589 return getIfUnordered(dyn_cast<LoadInst>(In));
590}
591template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {
592 return getIfUnordered(dyn_cast<StoreInst>(In));
593}
594
595// Forward other erase_ifs to the LLVM implementations.
596template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
597 llvm::erase_if(std::forward<T>(container), p);
598}
599
600} // namespace
601
602// --- Begin AlignVectors
603
604// For brevity, only consider loads. We identify a group of loads where we
605// know the relative differences between their addresses, so we know how they
606// are laid out in memory (relative to one another). These loads can overlap,
607// can be shorter or longer than the desired vector length.
608// Ultimately we want to generate a sequence of aligned loads that will load
609// every byte that the original loads loaded, and have the program use these
610// loaded values instead of the original loads.
611// We consider the contiguous memory area spanned by all these loads.
612//
613// Let's say that a single aligned vector load can load 16 bytes at a time.
614// If the program wanted to use a byte at offset 13 from the beginning of the
615// original span, it will be a byte at offset 13+x in the aligned data for
616// some x>=0. This may happen to be in the first aligned load, or in the load
617// following it. Since we generally don't know what the that alignment value
618// is at compile time, we proactively do valigns on the aligned loads, so that
619// byte that was at offset 13 is still at offset 13 after the valigns.
620//
621// This will be the starting point for making the rest of the program use the
622// data loaded by the new loads.
623// For each original load, and its users:
624// %v = load ...
625// ... = %v
626// ... = %v
627// we create
628// %new_v = extract/combine/shuffle data from loaded/valigned vectors so
629// it contains the same value as %v did before
630// then replace all users of %v with %new_v.
631// ... = %new_v
632// ... = %new_v
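//
// A small worked example (numbers chosen only for illustration): with
// 16-byte sectors, two 16-byte loads at offsets 3 and 19 from a common
// base cover bytes [3..35) of the span. Three aligned 16-byte loads
// starting at align_down(base+3) cover that range, and valigning each
// adjacent pair of them by the low bits of (base+3) keeps byte k of the
// original span at byte k of the realigned data.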
633
634auto AlignVectors::ByteSpan::extent() const -> int {
635 if (size() == 0)
636 return 0;
637 int Min = Blocks[0].Pos;
638 int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
639 for (int i = 1, e = size(); i != e; ++i) {
640 Min = std::min(Min, Blocks[i].Pos);
641 Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);
642 }
643 return Max - Min;
644}
645
646auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
647 ByteSpan Section;
648 for (const ByteSpan::Block &B : Blocks) {
649 int L = std::max(B.Pos, Start); // Left end.
650 int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.
651 if (L < R) {
652 // How much to chop off the beginning of the segment:
653 int Off = L > B.Pos ? L - B.Pos : 0;
654 Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);
655 }
656 }
657 return Section;
658}
659
660auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
661 for (Block &B : Blocks)
662 B.Pos += Offset;
663 return *this;
664}
665
666auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
667 SmallVector<Value *, 8> Values(Blocks.size());
668 for (int i = 0, e = Blocks.size(); i != e; ++i)
669 Values[i] = Blocks[i].Seg.Val;
670 return Values;
671}
672
673// Turn a requested integer alignment into the effective Align to use.
674// If Requested == 0 -> use ABI alignment of the value type (old semantics).
675// 0 means "ABI alignment" in old IR.
676static Align effectiveAlignForValueTy(const DataLayout &DL, Type *ValTy,
677 int Requested) {
678 if (Requested > 0)
679 return Align(static_cast<uint64_t>(Requested));
680 return Align(DL.getABITypeAlign(ValTy).value());
681}
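// For instance, a Requested value of 0 for an i32 falls back to the ABI
// alignment from the DataLayout (4 bytes under the usual Hexagon layout),
// while a Requested value of 128 yields Align(128) regardless of the type.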
682
683auto AlignVectors::getAddrInfo(Instruction &In) const
684 -> std::optional<AddrInfo> {
685 if (auto *L = isCandidate<LoadInst>(&In))
686 return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
687 L->getAlign());
688 if (auto *S = isCandidate<StoreInst>(&In))
689 return AddrInfo(HVC, S, S->getPointerOperand(),
690 S->getValueOperand()->getType(), S->getAlign());
691 if (auto *II = isCandidate<IntrinsicInst>(&In)) {
692 Intrinsic::ID ID = II->getIntrinsicID();
693 switch (ID) {
694 case Intrinsic::masked_load:
695 return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),
696 II->getParamAlign(0).valueOrOne());
697 case Intrinsic::masked_store:
698 return AddrInfo(HVC, II, II->getArgOperand(1),
699 II->getArgOperand(0)->getType(),
700 II->getParamAlign(1).valueOrOne());
701 }
702 }
703 return std::nullopt;
704}
705
706auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
707 return HVC.HST.isTypeForHVX(AI.ValTy);
708}
709
710auto AlignVectors::getPayload(Value *Val) const -> Value * {
711 if (auto *In = dyn_cast<Instruction>(Val)) {
712 Intrinsic::ID ID = 0;
713 if (auto *II = dyn_cast<IntrinsicInst>(In))
714 ID = II->getIntrinsicID();
715 if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)
716 return In->getOperand(0);
717 }
718 return Val;
719}
720
721auto AlignVectors::getMask(Value *Val) const -> Value * {
722 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
723 switch (II->getIntrinsicID()) {
724 case Intrinsic::masked_load:
725 return II->getArgOperand(1);
726 case Intrinsic::masked_store:
727 return II->getArgOperand(2);
728 }
729 }
730
731 Type *ValTy = getPayload(Val)->getType();
732 if (auto *VecTy = dyn_cast<VectorType>(ValTy))
733 return Constant::getAllOnesValue(HVC.getBoolTy(HVC.length(VecTy)));
734 return Constant::getAllOnesValue(HVC.getBoolTy());
735}
736
737auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
738 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
739 if (II->getIntrinsicID() == Intrinsic::masked_load)
740 return II->getArgOperand(2);
741 }
742 return UndefValue::get(getPayload(Val)->getType());
743}
744
745auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
746 Type *ValTy, int Adjust,
747 const InstMap &CloneMap) const
748 -> Value * {
749 if (auto *I = dyn_cast<Instruction>(Ptr))
750 if (Instruction *New = CloneMap.lookup(I))
751 Ptr = New;
752 return Builder.CreatePtrAdd(Ptr, HVC.getConstInt(Adjust), "gep");
753}
754
755auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
756 Type *ValTy, int Alignment,
757 const InstMap &CloneMap) const
758 -> Value * {
759 auto remap = [&](Value *V) -> Value * {
760 if (auto *I = dyn_cast<Instruction>(V)) {
761 for (auto [Old, New] : CloneMap)
762 I->replaceUsesOfWith(Old, New);
763 return I;
764 }
765 return V;
766 };
767 Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy(), "pti");
768 Value *Mask = HVC.getConstInt(-Alignment);
769 Value *And = Builder.CreateAnd(remap(AsInt), Mask, "and");
770 return Builder.CreateIntToPtr(
771 And, PointerType::getUnqual(ValTy->getContext()), "itp");
772}
773
774auto AlignVectors::createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
775 Value *Predicate, int Alignment, Value *Mask,
776 Value *PassThru,
777 ArrayRef<Value *> MDSources) const -> Value * {
778 // Predicate is nullptr if not creating predicated load
779 if (Predicate) {
780 assert(!Predicate->getType()->isVectorTy() &&
781 "Expectning scalar predicate");
782 if (HVC.isFalse(Predicate))
783 return UndefValue::get(ValTy);
784 if (!HVC.isTrue(Predicate)) {
785 Value *Load = createPredicatedLoad(Builder, ValTy, Ptr, Predicate,
786 Alignment, MDSources);
787 return Builder.CreateSelect(Mask, Load, PassThru);
788 }
789 // Predicate == true here.
790 }
791 assert(!HVC.isUndef(Mask)); // Should this be allowed?
792 if (HVC.isZero(Mask))
793 return PassThru;
794
795 Align EffA = effectiveAlignForValueTy(HVC.DL, ValTy, Alignment);
796 if (HVC.isTrue(Mask))
797 return createSimpleLoad(Builder, ValTy, Ptr, EffA.value(), MDSources);
798
799 Instruction *Load =
800 Builder.CreateMaskedLoad(ValTy, Ptr, EffA, Mask, PassThru, "mld");
801 LLVM_DEBUG(dbgs() << "\t[Creating masked Load:] "; Load->dump());
802 propagateMetadata(Load, MDSources);
803 return Load;
804}
805
806auto AlignVectors::createSimpleLoad(IRBuilderBase &Builder, Type *ValTy,
807 Value *Ptr, int Alignment,
808 ArrayRef<Value *> MDSources) const
809 -> Value * {
810 Align EffA = effectiveAlignForValueTy(HVC.DL, ValTy, Alignment);
811 Instruction *Load = Builder.CreateAlignedLoad(ValTy, Ptr, EffA, "ald");
812 propagateMetadata(Load, MDSources);
813 LLVM_DEBUG(dbgs() << "\t[Creating Load:] "; Load->dump());
814 return Load;
815}
816
817auto AlignVectors::createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy,
818 Value *Ptr, Value *Predicate,
819 int Alignment,
820 ArrayRef<Value *> MDSources) const
821 -> Value * {
822 assert(HVC.HST.isTypeForHVX(ValTy) &&
823 "Predicates 'scalar' vector loads not yet supported");
824 assert(Predicate);
825 assert(!Predicate->getType()->isVectorTy() && "Expecting scalar predicate");
826 Align EffA = effectiveAlignForValueTy(HVC.DL, ValTy, Alignment);
827 assert(HVC.getSizeOf(ValTy, HVC.Alloc) % EffA.value() == 0);
828
829 if (HVC.isFalse(Predicate))
830 return UndefValue::get(ValTy);
831 if (HVC.isTrue(Predicate))
832 return createSimpleLoad(Builder, ValTy, Ptr, EffA.value(), MDSources);
833
834 auto V6_vL32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vL32b_pred_ai);
835 // FIXME: This may not put the offset from Ptr into the vmem offset.
836 return HVC.createHvxIntrinsic(Builder, V6_vL32b_pred_ai, ValTy,
837 {Predicate, Ptr, HVC.getConstInt(0)}, {},
838 MDSources);
839}
840
841auto AlignVectors::createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
842 Value *Predicate, int Alignment, Value *Mask,
843 ArrayRef<Value *> MDSources) const -> Value * {
844 if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))
845 return UndefValue::get(Val->getType());
846 assert(!Predicate || (!Predicate->getType()->isVectorTy() &&
847 "Expectning scalar predicate"));
848 if (Predicate) {
849 if (HVC.isFalse(Predicate))
850 return UndefValue::get(Val->getType());
851 if (HVC.isTrue(Predicate))
852 Predicate = nullptr;
853 }
854 // Here both Predicate and Mask are true or unknown.
855
856 if (HVC.isTrue(Mask)) {
857 if (Predicate) { // Predicate unknown
858 return createPredicatedStore(Builder, Val, Ptr, Predicate, Alignment,
859 MDSources);
860 }
861 // Predicate is true:
862 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
863 }
864
865 // Mask is unknown
866 if (!Predicate) {
867 Instruction *Store =
868 Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
869 propagateMetadata(Store, MDSources);
870 return Store;
871 }
872
873 // Both Predicate and Mask are unknown.
874 // Emulate masked store with predicated-load + mux + predicated-store.
875 Value *PredLoad = createPredicatedLoad(Builder, Val->getType(), Ptr,
876 Predicate, Alignment, MDSources);
877 Value *Mux = Builder.CreateSelect(Mask, Val, PredLoad);
878 return createPredicatedStore(Builder, Mux, Ptr, Predicate, Alignment,
879 MDSources);
880}
881
882auto AlignVectors::createSimpleStore(IRBuilderBase &Builder, Value *Val,
883 Value *Ptr, int Alignment,
884 ArrayRef<Value *> MDSources) const
885 -> Value * {
886 Align EffA = effectiveAlignForValueTy(HVC.DL, Val->getType(), Alignment);
887 Instruction *Store = Builder.CreateAlignedStore(Val, Ptr, EffA);
888 LLVM_DEBUG(dbgs() << "\t[Creating store:] "; Store->dump());
889 propagateMetadata(Store, MDSources);
890 return Store;
891}
892
893auto AlignVectors::createPredicatedStore(IRBuilderBase &Builder, Value *Val,
894 Value *Ptr, Value *Predicate,
895 int Alignment,
896 ArrayRef<Value *> MDSources) const
897 -> Value * {
898 Align EffA = effectiveAlignForValueTy(HVC.DL, Val->getType(), Alignment);
899 assert(HVC.HST.isTypeForHVX(Val->getType()) &&
900 "Predicates 'scalar' vector stores not yet supported");
901 assert(Predicate);
902 if (HVC.isFalse(Predicate))
903 return UndefValue::get(Val->getType());
904 if (HVC.isTrue(Predicate))
905 return createSimpleStore(Builder, Val, Ptr, EffA.value(), MDSources);
906
907 assert(HVC.getSizeOf(Val, HVC.Alloc) % EffA.value() == 0);
908 auto V6_vS32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vS32b_pred_ai);
909 // FIXME: This may not put the offset from Ptr into the vmem offset.
910 return HVC.createHvxIntrinsic(Builder, V6_vS32b_pred_ai, nullptr,
911 {Predicate, Ptr, HVC.getConstInt(0), Val}, {},
912 MDSources);
913}
914
915auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
916 -> DepList {
917 BasicBlock *Parent = Base->getParent();
918 assert(In->getParent() == Parent &&
919 "Base and In should be in the same block");
920 assert(Base->comesBefore(In) && "Base should come before In");
921
922 DepList Deps;
923 std::deque<Instruction *> WorkQ = {In};
924 while (!WorkQ.empty()) {
925 Instruction *D = WorkQ.front();
926 WorkQ.pop_front();
927 if (D != In)
928 Deps.insert(D);
929 for (Value *Op : D->operands()) {
930 if (auto *I = dyn_cast<Instruction>(Op)) {
931 if (I->getParent() == Parent && Base->comesBefore(I))
932 WorkQ.push_back(I);
933 }
934 }
935 }
936 return Deps;
937}
938
939auto AlignVectors::createAddressGroups() -> bool {
940 // An address group created here may contain instructions spanning
941 // multiple basic blocks.
942 AddrList WorkStack;
943
944 auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
945 for (AddrInfo &W : WorkStack) {
946 if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))
947 return std::make_pair(W.Inst, *D);
948 }
949 return std::make_pair(nullptr, 0);
950 };
951
952 auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
953 BasicBlock &Block = *DomN->getBlock();
954 for (Instruction &I : Block) {
955 auto AI = this->getAddrInfo(I); // Use this-> for gcc6.
956 if (!AI)
957 continue;
958 auto F = findBaseAndOffset(*AI);
959 Instruction *GroupInst;
960 if (Instruction *BI = F.first) {
961 AI->Offset = F.second;
962 GroupInst = BI;
963 } else {
964 WorkStack.push_back(*AI);
965 GroupInst = AI->Inst;
966 }
967 AddrGroups[GroupInst].push_back(*AI);
968 }
969
970 for (DomTreeNode *C : DomN->children())
971 Visit(C, Visit);
972
973 while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
974 WorkStack.pop_back();
975 };
976
977 traverseBlock(HVC.DT.getRootNode(), traverseBlock);
978 assert(WorkStack.empty());
979
980 // AddrGroups are formed.
981 // Remove groups of size 1.
982 AddrGroups.remove_if([](auto &G) { return G.second.size() == 1; });
983 // Remove groups that don't use HVX types.
984 AddrGroups.remove_if([&](auto &G) {
985 return llvm::none_of(
986 G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
987 });
988
989 LLVM_DEBUG(dbgs() << AddrGroups);
990 return !AddrGroups.empty();
991}
992
993auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
994 // Form load groups.
995 // To avoid complications with moving code across basic blocks, only form
996 // groups that are contained within a single basic block.
997 unsigned SizeLimit = VAGroupSizeLimit;
998 if (SizeLimit == 0)
999 return {};
1000
1001 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1002 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1003 if (Move.Main.size() >= SizeLimit)
1004 return false;
1005 // Don't mix HVX and non-HVX instructions.
1006 if (Move.IsHvx != isHvx(Info))
1007 return false;
1008 // Leading instruction in the load group.
1009 Instruction *Base = Move.Main.front();
1010 if (Base->getParent() != Info.Inst->getParent())
1011 return false;
1012 // Check if it's safe to move the load.
1013 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator()))
1014 return false;
1015 // And if it's safe to clone the dependencies.
1016 auto isSafeToCopyAtBase = [&](const Instruction *I) {
1017 return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator()) &&
1018 HVC.isSafeToClone(*I);
1019 };
1020 DepList Deps = getUpwardDeps(Info.Inst, Base);
1021 if (!llvm::all_of(Deps, isSafeToCopyAtBase))
1022 return false;
1023
1024 Move.Main.push_back(Info.Inst);
1025 llvm::append_range(Move.Deps, Deps);
1026 return true;
1027 };
1028
1029 MoveList LoadGroups;
1030
1031 for (const AddrInfo &Info : Group) {
1032 if (!Info.Inst->mayReadFromMemory())
1033 continue;
1034 if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
1035 LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);
1036 }
1037
1038 // Erase singleton groups.
1039 erase_if(LoadGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
1040
1041 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1042 if (!HVC.HST.useHVXV62Ops())
1043 erase_if(LoadGroups, [](const MoveGroup &G) { return G.IsHvx; });
1044
1045 LLVM_DEBUG(dbgs() << "LoadGroups list: " << LoadGroups);
1046 return LoadGroups;
1047}
1048
1049auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
1050 // Form store groups.
1051 // To avoid complications with moving code across basic blocks, only form
1052 // groups that are contained within a single basic block.
1053 unsigned SizeLimit = VAGroupSizeLimit;
1054 if (SizeLimit == 0)
1055 return {};
1056
1057 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1058 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1059 if (Move.Main.size() >= SizeLimit)
1060 return false;
1061 // For stores with return values we'd have to collect downward dependencies.
1062 // There are no such stores that we handle at the moment, so omit that.
1063 assert(Info.Inst->getType()->isVoidTy() &&
1064 "Not handling stores with return values");
1065 // Don't mix HVX and non-HVX instructions.
1066 if (Move.IsHvx != isHvx(Info))
1067 return false;
1068 // For stores we need to be careful whether it's safe to move them.
1069 // Stores that are otherwise safe to move together may not appear safe
1070 // to move over one another (i.e. isSafeToMoveBefore may return false).
1071 Instruction *Base = Move.Main.front();
1072 if (Base->getParent() != Info.Inst->getParent())
1073 return false;
1074 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(), Move.Main))
1075 return false;
1076 Move.Main.push_back(Info.Inst);
1077 return true;
1078 };
1079
1080 MoveList StoreGroups;
1081
1082 for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
1083 const AddrInfo &Info = *I;
1084 if (!Info.Inst->mayWriteToMemory())
1085 continue;
1086 if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
1087 StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);
1088 }
1089
1090 // Erase singleton groups.
1091 erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
1092
1093 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1094 if (!HVC.HST.useHVXV62Ops())
1095 erase_if(StoreGroups, [](const MoveGroup &G) { return G.IsHvx; });
1096
1097 // Erase groups where every store is a full HVX vector. The reason is that
1098 // aligning predicated stores generates complex code that may be less
1099 // efficient than a sequence of unaligned vector stores.
1100 if (!VADoFullStores) {
1101 erase_if(StoreGroups, [this](const MoveGroup &G) {
1102 return G.IsHvx && llvm::all_of(G.Main, [this](Instruction *S) {
1103 auto MaybeInfo = this->getAddrInfo(*S);
1104 assert(MaybeInfo.has_value());
1105 return HVC.HST.isHVXVectorType(
1106 EVT::getEVT(MaybeInfo->ValTy, false));
1107 });
1108 });
1109 }
1110
1111 return StoreGroups;
1112}
1113
1114auto AlignVectors::moveTogether(MoveGroup &Move) const -> bool {
1115 // Move all instructions to be adjacent.
1116 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1117 Instruction *Where = Move.Main.front();
1118
1119 if (Move.IsLoad) {
1120 // Move all the loads (and dependencies) to where the first load is.
1121 // Clone all deps to before Where, keeping order.
1122 Move.Clones = cloneBefore(Where->getIterator(), Move.Deps);
1123 // Move all main instructions to after Where, keeping order.
1124 ArrayRef<Instruction *> Main(Move.Main);
1125 for (Instruction *M : Main) {
1126 if (M != Where)
1127 M->moveAfter(Where);
1128 for (auto [Old, New] : Move.Clones)
1129 M->replaceUsesOfWith(Old, New);
1130 Where = M;
1131 }
1132 // Replace Deps with the clones.
1133 for (int i = 0, e = Move.Deps.size(); i != e; ++i)
1134 Move.Deps[i] = Move.Clones[Move.Deps[i]];
1135 } else {
1136 // Move all the stores to where the last store is.
1137 // NOTE: Deps are empty for "store" groups. If they need to be
1138 // non-empty, decide on the order.
1139 assert(Move.Deps.empty());
1140 // Move all main instructions to before Where, inverting order.
1141 ArrayRef<Instruction *> Main(Move.Main);
1142 for (Instruction *M : Main.drop_front(1)) {
1143 M->moveBefore(Where->getIterator());
1144 Where = M;
1145 }
1146 }
1147
1148 return Move.Main.size() + Move.Deps.size() > 1;
1149}
1150
1151template <typename T>
1152auto AlignVectors::cloneBefore(BasicBlock::iterator To, T &&Insts) const
1153 -> InstMap {
1154 InstMap Map;
1155
1156 for (Instruction *I : Insts) {
1157 assert(HVC.isSafeToClone(*I));
1158 Instruction *C = I->clone();
1159 C->setName(Twine("c.") + I->getName() + ".");
1160 C->insertBefore(To);
1161
1162 for (auto [Old, New] : Map)
1163 C->replaceUsesOfWith(Old, New);
1164 Map.insert(std::make_pair(I, C));
1165 }
1166 return Map;
1167}
1168
1169auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
1170 const ByteSpan &VSpan, int ScLen,
1171 Value *AlignVal, Value *AlignAddr) const
1172 -> void {
1173 LLVM_DEBUG(dbgs() << __func__ << "\n");
1174
1175 Type *SecTy = HVC.getByteTy(ScLen);
1176 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1177 bool DoAlign = !HVC.isZero(AlignVal);
1178 BasicBlock::iterator BasePos = Builder.GetInsertPoint();
1179 BasicBlock *BaseBlock = Builder.GetInsertBlock();
1180
1181 ByteSpan ASpan;
1182 auto *True = Constant::getAllOnesValue(HVC.getBoolTy(ScLen));
1183 auto *Undef = UndefValue::get(SecTy);
1184
1185 // A created load does not have to be an Instruction (e.g. it may be "undef").
1186 SmallVector<Value *> Loads(NumSectors + DoAlign, nullptr);
1187
1188 // We could create all of the aligned loads, and generate the valigns
1189 // at the location of the first load, but for large load groups, this
1190 // could create highly suboptimal code (there have been groups of 140+
1191 // loads in real code).
1192 // Instead, place the loads/valigns as close to the users as possible.
1193 // In any case we need to have a mapping from the blocks of VSpan (the
1194 // span covered by the pre-existing loads) to ASpan (the span covered
1195 // by the aligned loads). There is a small problem, though: ASpan needs
1196 // to have pointers to the loads/valigns, but we don't have these loads
1197 // because we don't know where to put them yet. We find out by creating
1198 // a section of ASpan that corresponds to values (blocks) from VSpan,
1199 // and checking where the new load should be placed. We need to attach
1200 // this location information to each block in ASpan somehow, so we put
1201 // distinct values for Seg.Val in each ASpan.Blocks[i], and use a map
1202 // to store the location for each Seg.Val.
1203 // The distinct values happen to be Blocks[i].Seg.Val = &Blocks[i],
1204 // which helps with printing ByteSpans without crashing when printing
1205 // Segments with these temporary identifiers in place of Val.
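 //
 // For example, if a VSpan block covering bytes [32..96) maps (with ScLen
 // of 64) onto ASpan sectors 0 and 1, then the earliest user of that block
 // constrains where the loads for sectors 0 and 1, and the valign that
 // combines them, may be placed.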
1206
1207 // Populate the blocks first, to avoid reallocations of the vector
1208 // interfering with generating the placeholder addresses.
1209 for (int Index = 0; Index != NumSectors; ++Index)
1210 ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
1211 for (int Index = 0; Index != NumSectors; ++Index) {
1212 ASpan.Blocks[Index].Seg.Val =
1213 reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
1214 }
1215
1216 // Multiple values from VSpan can map to the same value in ASpan. Since we
1217 // try to create loads lazily, we need to find the earliest use for each
1218 // value from ASpan.
1219 DenseMap<void *, Instruction *> EarliestUser;
1220 auto isEarlier = [](Instruction *A, Instruction *B) {
1221 if (B == nullptr)
1222 return true;
1223 if (A == nullptr)
1224 return false;
1225 assert(A->getParent() == B->getParent());
1226 return A->comesBefore(B);
1227 };
1228 auto earliestUser = [&](const auto &Uses) {
1229 Instruction *User = nullptr;
1230 for (const Use &U : Uses) {
1231 auto *I = dyn_cast<Instruction>(U.getUser());
1232 assert(I != nullptr && "Load used in a non-instruction?");
1233 // Make sure we only consider users in this block, but we need
1234 // to remember if there were users outside the block too. This is
1235 // because if no users are found, aligned loads will not be created.
1236 if (I->getParent() == BaseBlock) {
1237 if (!isa<PHINode>(I))
1238 User = std::min(User, I, isEarlier);
1239 } else {
1240 User = std::min(User, BaseBlock->getTerminator(), isEarlier);
1241 }
1242 }
1243 return User;
1244 };
1245
1246 for (const ByteSpan::Block &B : VSpan) {
1247 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
1248 for (const ByteSpan::Block &S : ASection) {
1249 auto &EU = EarliestUser[S.Seg.Val];
1250 EU = std::min(EU, earliestUser(B.Seg.Val->uses()), isEarlier);
1251 }
1252 }
1253
1254 LLVM_DEBUG({
1255 dbgs() << "ASpan:\n" << ASpan << '\n';
1256 dbgs() << "Earliest users of ASpan:\n";
1257 for (auto &[Val, User] : EarliestUser) {
1258 dbgs() << Val << "\n ->" << *User << '\n';
1259 }
1260 });
1261
1262 auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
1263 int Index, bool MakePred) {
1264 Value *Ptr =
1265 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1266 Value *Predicate =
1267 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1268
1269 // If vector shifting is potentially needed, accumulate metadata
1270 // from source sections of twice the load width.
1271 int Start = (Index - DoAlign) * ScLen;
1272 int Width = (1 + DoAlign) * ScLen;
1273 return this->createLoad(Builder, SecTy, Ptr, Predicate, ScLen, True, Undef,
1274 VSpan.section(Start, Width).values());
1275 };
1276
1277 auto moveBefore = [this](BasicBlock::iterator In, BasicBlock::iterator To) {
1278 // Move In and its upward dependencies to before To.
1279 assert(In->getParent() == To->getParent());
1280 DepList Deps = getUpwardDeps(&*In, &*To);
1281 In->moveBefore(To);
1282 // DepList is sorted with respect to positions in the basic block.
1283 InstMap Map = cloneBefore(In, Deps);
1284 for (auto [Old, New] : Map)
1285 In->replaceUsesOfWith(Old, New);
1286 };
1287
1288 // Generate necessary loads at appropriate locations.
1289 LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");
1290 for (int Index = 0; Index != NumSectors + 1; ++Index) {
1291 // In ASpan, each block will be either a single aligned load, or a
1292 // valign of a pair of loads. In the latter case, an aligned load j
1293 // will belong to the current valign, and the one in the previous
1294 // block (for j > 0).
1295 // Place the load at a location which will dominate the valign, assuming
1296 // the valign will be placed right before the earliest user.
1297 Instruction *PrevAt =
1298 DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
1299 Instruction *ThisAt =
1300 Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
1301 if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
1302 Builder.SetInsertPoint(Where);
1303 Loads[Index] =
1304 createLoad(Builder, VSpan, Index, DoAlign && Index == NumSectors);
1305 // We know it's safe to put the load at BasePos, but we'd prefer to put
1306 // it at "Where". To see if the load is safe to be placed at Where, put
1307 // it there first and then check if it's safe to move it to BasePos.
1308 // If not, then the load needs to be placed at BasePos.
1309 // We can't do this check proactively because we need the load to exist
1310 // in order to check legality.
1311 if (auto *Load = dyn_cast<Instruction>(Loads[Index])) {
1312 if (!HVC.isSafeToMoveBeforeInBB(*Load, BasePos))
1313 moveBefore(Load->getIterator(), BasePos);
1314 }
1315 LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');
1316 }
1317 }
1318
1319 // Generate valigns if needed, and fill in proper values in ASpan
1320 LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");
1321 for (int Index = 0; Index != NumSectors; ++Index) {
1322 ASpan[Index].Seg.Val = nullptr;
1323 if (auto *Where = EarliestUser[&ASpan[Index]]) {
1324 Builder.SetInsertPoint(Where);
1325 Value *Val = Loads[Index];
1326 assert(Val != nullptr);
1327 if (DoAlign) {
1328 Value *NextLoad = Loads[Index + 1];
1329 assert(NextLoad != nullptr);
1330 Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
1331 }
1332 ASpan[Index].Seg.Val = Val;
1333 LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');
1334 }
1335 }
1336
1337 for (const ByteSpan::Block &B : VSpan) {
1338 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
1339 Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
1340 Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));
1341
1342 // We're generating a reduction, where each instruction depends on
1343 // the previous one, so we need to order them according to the position
1344 // of their inputs in the code.
1345 std::vector<ByteSpan::Block *> ABlocks;
1346 for (ByteSpan::Block &S : ASection) {
1347 if (S.Seg.Val != nullptr)
1348 ABlocks.push_back(&S);
1349 }
1350 llvm::sort(ABlocks,
1351 [&](const ByteSpan::Block *A, const ByteSpan::Block *B) {
1352 return isEarlier(cast<Instruction>(A->Seg.Val),
1353 cast<Instruction>(B->Seg.Val));
1354 });
1355 for (ByteSpan::Block *S : ABlocks) {
1356 // The processing of the data loaded by the aligned loads
1357 // needs to be inserted after the data is available.
1358 Instruction *SegI = cast<Instruction>(S->Seg.Val);
1359 Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
1360 Value *Pay = HVC.vbytes(Builder, getPayload(S->Seg.Val));
1361 Accum =
1362 HVC.insertb(Builder, Accum, Pay, S->Seg.Start, S->Seg.Size, S->Pos);
1363 }
1364 // Instead of casting everything to bytes for the vselect, cast to the
1365 // original value type. This will avoid complications with casting masks.
1366 // For example, in cases when the original mask applied to i32, it could
1367 // be converted to a mask applicable to i8 via pred_typecast intrinsic,
1368 // but if the mask is not exactly of HVX length, extra handling would be
1369 // needed to make it work.
1370 Type *ValTy = getPayload(B.Seg.Val)->getType();
1371 Value *Cast = Builder.CreateBitCast(Accum, ValTy, "cst");
1372 Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
1373 getPassThrough(B.Seg.Val), "sel");
1374 B.Seg.Val->replaceAllUsesWith(Sel);
1375 }
1376}
1377
1378auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
1379 const ByteSpan &VSpan, int ScLen,
1380 Value *AlignVal, Value *AlignAddr) const
1381 -> void {
1382 LLVM_DEBUG(dbgs() << __func__ << "\n");
1383
1384 Type *SecTy = HVC.getByteTy(ScLen);
1385 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1386 bool DoAlign = !HVC.isZero(AlignVal);
1387
1388 // Stores.
1389 ByteSpan ASpanV, ASpanM;
1390
1391 // Return a vector value corresponding to the input value Val:
1392 // either <1 x Val> for scalar Val, or Val itself for vector Val.
1393 auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
1394 Type *Ty = Val->getType();
1395 if (Ty->isVectorTy())
1396 return Val;
1397 auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
1398 return Builder.CreateBitCast(Val, VecTy, "cst");
1399 };
1400
1401 // Create an extra "undef" sector at the beginning and at the end.
1402 // They will be used as the left/right filler in the vlalign step.
1403 for (int Index = (DoAlign ? -1 : 0); Index != NumSectors + DoAlign; ++Index) {
1404 // For stores, the size of each section is an aligned vector length.
1405 // Adjust the store offsets relative to the section start offset.
1406 ByteSpan VSection =
1407 VSpan.section(Index * ScLen, ScLen).shift(-Index * ScLen);
1408 Value *Undef = UndefValue::get(SecTy);
1409 Value *Zero = Constant::getNullValue(SecTy);
1410 Value *AccumV = Undef;
1411 Value *AccumM = Zero;
1412 for (ByteSpan::Block &S : VSection) {
1413 Value *Pay = getPayload(S.Seg.Val);
1414 Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
1415 Pay->getType(), HVC.getByteTy());
1416 Value *PartM = HVC.insertb(Builder, Zero, HVC.vbytes(Builder, Mask),
1417 S.Seg.Start, S.Seg.Size, S.Pos);
1418 AccumM = Builder.CreateOr(AccumM, PartM);
1419
1420 Value *PartV = HVC.insertb(Builder, Undef, HVC.vbytes(Builder, Pay),
1421 S.Seg.Start, S.Seg.Size, S.Pos);
1422
1423 AccumV = Builder.CreateSelect(
1424 Builder.CreateICmp(CmpInst::ICMP_NE, PartM, Zero), PartV, AccumV);
1425 }
1426 ASpanV.Blocks.emplace_back(AccumV, ScLen, Index * ScLen);
1427 ASpanM.Blocks.emplace_back(AccumM, ScLen, Index * ScLen);
1428 }
1429
1430 LLVM_DEBUG({
1431 dbgs() << "ASpanV before vlalign:\n" << ASpanV << '\n';
1432 dbgs() << "ASpanM before vlalign:\n" << ASpanM << '\n';
1433 });
1434
1435 // vlalign
1436 if (DoAlign) {
1437 for (int Index = 1; Index != NumSectors + 2; ++Index) {
1438 Value *PrevV = ASpanV[Index - 1].Seg.Val, *ThisV = ASpanV[Index].Seg.Val;
1439 Value *PrevM = ASpanM[Index - 1].Seg.Val, *ThisM = ASpanM[Index].Seg.Val;
1440 assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
1441 ASpanV[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
1442 ASpanM[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
1443 }
1444 }
1445
1446 LLVM_DEBUG({
1447 dbgs() << "ASpanV after vlalign:\n" << ASpanV << '\n';
1448 dbgs() << "ASpanM after vlalign:\n" << ASpanM << '\n';
1449 });
1450
1451 auto createStore = [&](IRBuilderBase &Builder, const ByteSpan &ASpanV,
1452 const ByteSpan &ASpanM, int Index, bool MakePred) {
1453 Value *Val = ASpanV[Index].Seg.Val;
1454 Value *Mask = ASpanM[Index].Seg.Val; // bytes
1455 if (HVC.isUndef(Val) || HVC.isZero(Mask))
1456 return;
1457 Value *Ptr =
1458 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1459 Value *Predicate =
1460 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1461
1462 // If vector shifting is potentially needed, accumulate metadata
1463 // from source sections of twice the store width.
1464 int Start = (Index - DoAlign) * ScLen;
1465 int Width = (1 + DoAlign) * ScLen;
1466 this->createStore(Builder, Val, Ptr, Predicate, ScLen,
1467 HVC.vlsb(Builder, Mask),
1468 VSpan.section(Start, Width).values());
1469 };
1470
1471 for (int Index = 0; Index != NumSectors + DoAlign; ++Index) {
1472 createStore(Builder, ASpanV, ASpanM, Index, DoAlign && Index == NumSectors);
1473 }
1474}
1475
1476auto AlignVectors::realignGroup(const MoveGroup &Move) -> bool {
1477 LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');
1478
1479 // TODO: Needs support for masked loads/stores of "scalar" vectors.
1480 if (!Move.IsHvx)
1481 return false;
1482
1483 // Return the element with the maximum alignment from Range,
1484 // where GetValue obtains the value to compare from an element.
1485 auto getMaxOf = [](auto Range, auto GetValue) {
1486 return *llvm::max_element(Range, [&GetValue](auto &A, auto &B) {
1487 return GetValue(A) < GetValue(B);
1488 });
1489 };
1490
1491 AddrList &BaseInfos = AddrGroups[Move.Base];
1492
1493 // Conceptually, there is a vector of N bytes covering the addresses
1494 // starting from the minimum offset (i.e. Base.Addr+Start). This vector
1495 // represents a contiguous memory region that spans all accessed memory
1496 // locations.
1497 // The correspondence between loaded or stored values will be expressed
1498 // in terms of this vector. For example, the 0th element of the vector
1499 // from the Base address info will start at byte Start from the beginning
1500 // of this conceptual vector.
1501 //
1502 // This vector will be loaded/stored starting at the nearest down-aligned
1503 // address and the amount of the down-alignment will be AlignVal:
1504 // valign(load_vector(align_down(Base+Start)), AlignVal)
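 //
 // As a sketch with made-up numbers for the known-alignment case below:
 // if the group's lowest offset (Start) is -4 and the most-aligned member
 // sits at offset 8 with a needed alignment of 64, the pointer is adjusted
 // by -alignTo(8 - (-4), 64) = -64 from that member, and the valign amount
 // becomes -4 - (8 - 64) = 52, i.e. a value in [0, 64).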
1505
1506 std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
1507 AddrList MoveInfos;
1508
1509 llvm::copy_if(
1510 BaseInfos, std::back_inserter(MoveInfos),
1511 [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });
1512
1513 // Maximum alignment present in the whole address group.
1514 const AddrInfo &WithMaxAlign =
1515 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
1516 Align MaxGiven = WithMaxAlign.HaveAlign;
1517
1518 // Element of the move address group with the minimum offset (lowest address).
1519 const AddrInfo &WithMinOffset =
1520 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
1521
1522 const AddrInfo &WithMaxNeeded =
1523 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
1524 Align MinNeeded = WithMaxNeeded.NeedAlign;
1525
1526 // Set the builder's insertion point right before the load group, or
1527 // immediately after the store group. (Instructions in a store group are
1528 // listed in reverse order.)
1529 Instruction *InsertAt = Move.Main.front();
1530 if (!Move.IsLoad) {
1531 // There should be a terminator (which a store isn't, but check anyway).
1532 assert(InsertAt->getIterator() != InsertAt->getParent()->end());
1533 InsertAt = &*std::next(InsertAt->getIterator());
1534 }
1535
1536 IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
1537 InstSimplifyFolder(HVC.DL));
1538 Value *AlignAddr = nullptr; // Actual aligned address.
1539 Value *AlignVal = nullptr; // Right-shift amount (for valign).
1540
1541 if (MinNeeded <= MaxGiven) {
1542 int Start = WithMinOffset.Offset;
1543 int OffAtMax = WithMaxAlign.Offset;
1544 // Shift the offset of the maximally aligned instruction (OffAtMax)
1545 // back by just enough multiples of the required alignment to cover the
1546 // distance from Start to OffAtMax.
1547 // Calculate the address adjustment amount based on the address with the
1548 // maximum alignment. This is to allow a simple gep instruction instead
1549 // of potential bitcasts to i8*.
1550 int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());
1551 AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,
1552 WithMaxAlign.ValTy, Adjust, Move.Clones);
1553 int Diff = Start - (OffAtMax + Adjust);
1554 AlignVal = HVC.getConstInt(Diff);
1555 assert(Diff >= 0);
1556 assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
1557 } else {
1558 // WithMinOffset is the lowest address in the group,
1559 // WithMinOffset.Addr = Base+Start.
1560 // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
1561 // mask off unnecessary bits, so it's ok to just use the original pointer as
1562 // the alignment amount.
1563 // Do an explicit down-alignment of the address to avoid creating an
1564 // aligned instruction with an address that is not really aligned.
1565 AlignAddr =
1566 createAlignedPointer(Builder, WithMinOffset.Addr, WithMinOffset.ValTy,
1567 MinNeeded.value(), Move.Clones);
1568 AlignVal =
1569 Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy(), "pti");
1570 if (auto *I = dyn_cast<Instruction>(AlignVal)) {
1571 for (auto [Old, New] : Move.Clones)
1572 I->replaceUsesOfWith(Old, New);
1573 }
1574 }
1575
1576 ByteSpan VSpan;
1577 for (const AddrInfo &AI : MoveInfos) {
1578 VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),
1579 AI.Offset - WithMinOffset.Offset);
1580 }
1581
1582 // The aligned loads/stores will use blocks that are either scalars,
1583 // or HVX vectors. Let "sector" be the unified term for such a block.
1584 // blend(scalar, vector) -> sector...
1585 int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
1586 : std::max<int>(MinNeeded.value(), 4);
1587 assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
1588 assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
1589
1590 LLVM_DEBUG({
1591 dbgs() << "ScLen: " << ScLen << "\n";
1592 dbgs() << "AlignVal:" << *AlignVal << "\n";
1593 dbgs() << "AlignAddr:" << *AlignAddr << "\n";
1594 dbgs() << "VSpan:\n" << VSpan << '\n';
1595 });
1596
1597 if (Move.IsLoad)
1598 realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1599 else
1600 realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1601
1602 for (auto *Inst : Move.Main)
1603 Inst->eraseFromParent();
1604
1605 return true;
1606}
1607
1608auto AlignVectors::makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
1609 int Alignment) const -> Value * {
1610 auto *AlignTy = AlignVal->getType();
1611 Value *And = Builder.CreateAnd(
1612 AlignVal, ConstantInt::get(AlignTy, Alignment - 1), "and");
1613 Value *Zero = ConstantInt::get(AlignTy, 0);
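  // E.g. with Alignment == 128 (a hypothetical value) this reduces to
  // (AlignVal & 127) != 0, i.e. true exactly when the address is not already
  // 128-byte aligned and the predicated sector store actually has to execute.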
1614 return Builder.CreateICmpNE(And, Zero, "isz");
1615}
1616
1617auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
1618 if (!HVC.isByteVecTy(Ty))
1619 return false;
1620 int Size = HVC.getSizeOf(Ty);
1621 if (HVC.HST.isTypeForHVX(Ty))
1622 return Size == static_cast<int>(HVC.HST.getVectorLength());
1623 return Size == 4 || Size == 8;
1624}
1625
1626auto AlignVectors::run() -> bool {
1627 LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
1628 << '\n');
1629 if (!createAddressGroups())
1630 return false;
1631
1632 LLVM_DEBUG({
1633 dbgs() << "Address groups(" << AddrGroups.size() << "):\n";
1634 for (auto &[In, AL] : AddrGroups) {
1635 for (const AddrInfo &AI : AL)
1636 dbgs() << "---\n" << AI << '\n';
1637 }
1638 });
1639
1640 bool Changed = false;
1641 MoveList LoadGroups, StoreGroups;
1642
1643 for (auto &G : AddrGroups) {
1644 llvm::append_range(LoadGroups, createLoadGroups(G.second));
1645 llvm::append_range(StoreGroups, createStoreGroups(G.second));
1646 }
1647
1648 LLVM_DEBUG({
1649 dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";
1650 for (const MoveGroup &G : LoadGroups)
1651 dbgs() << G << "\n";
1652 dbgs() << "Store groups(" << StoreGroups.size() << "):\n";
1653 for (const MoveGroup &G : StoreGroups)
1654 dbgs() << G << "\n";
1655 });
1656
1657 // Cumulative limit on the number of groups.
1658 unsigned CountLimit = VAGroupCountLimit;
1659 if (CountLimit == 0)
1660 return false;
1661
1662 if (LoadGroups.size() > CountLimit) {
1663 LoadGroups.resize(CountLimit);
1664 StoreGroups.clear();
1665 } else {
1666 unsigned StoreLimit = CountLimit - LoadGroups.size();
1667 if (StoreGroups.size() > StoreLimit)
1668 StoreGroups.resize(StoreLimit);
1669 }
1670
1671 for (auto &M : LoadGroups)
1672 Changed |= moveTogether(M);
1673 for (auto &M : StoreGroups)
1674 Changed |= moveTogether(M);
1675
1676 LLVM_DEBUG(dbgs() << "After moveTogether:\n" << HVC.F);
1677
1678 for (auto &M : LoadGroups)
1679 Changed |= realignGroup(M);
1680 for (auto &M : StoreGroups)
1681 Changed |= realignGroup(M);
1682
1683 return Changed;
1684}
1685
1686// --- End AlignVectors
1687
1688// --- Begin HvxIdioms
1689
1690auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
1691 -> std::pair<unsigned, Signedness> {
1692 unsigned Bits = HVC.getNumSignificantBits(V, In);
1693 // The significant bits are calculated including the sign bit. This may
1694 // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
1695 // result in 33 significant bits. To avoid extra words, skip the extra
1696 // sign bit, but keep information that the value is to be treated as
1697 // unsigned.
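  // For example, a value produced by (zext i16 to i32) reports 17 significant
  // bits; its top 16 bits are known zero, so the code below returns
  // {16, Unsigned} instead of {17, Signed}.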
1698 KnownBits Known = HVC.getKnownBits(V, In);
1699 Signedness Sign = Signed;
1700 unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
1701 if (isPowerOf2_32(Bits))
1702 NumToTest = Bits;
1703 else if (Bits > 1 && isPowerOf2_32(Bits - 1))
1704 NumToTest = Bits - 1;
1705
1706 if (NumToTest != 0 && Known.Zero.ashr(NumToTest).isAllOnes()) {
1707 Sign = Unsigned;
1708 Bits = NumToTest;
1709 }
1710
1711 // If the top bit within the width rounded up to the nearest power of 2 is
1712 // known zero, this value is positive and can be treated as signed or unsigned.
1713 if (unsigned Pow2 = PowerOf2Ceil(Bits); Pow2 != Bits) {
1714 if (Known.Zero.ashr(Pow2 - 1).isAllOnes())
1715 Sign = Positive;
1716 }
1717 return {Bits, Sign};
1718}
1719
1720auto HvxIdioms::canonSgn(SValue X, SValue Y) const
1721 -> std::pair<SValue, SValue> {
1722 // Canonicalize the signedness of X and Y, so that the result is one of:
1723 // S, S
1724 // U/P, S
1725 // U/P, U/P
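  // For example, an input pair (Signed, Unsigned) is returned as
  // (Unsigned, Signed), matching the "U/P, S" form above.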
1726 if (X.Sgn == Signed && Y.Sgn != Signed)
1727 std::swap(X, Y);
1728 return {X, Y};
1729}
1730
1731// Match
1732// (X * Y) [>> N], or
1733// ((X * Y) + (1 << M)) >> N
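// For example (hypothetical input), a rounded Q15-style sequence
//   t = x * y
//   r = (t + (1 << 14)) >> 15
// is recorded with Frac == 15 and RoundAt == 14.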
1734auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
1735 using namespace PatternMatch;
1736 auto *Ty = In.getType();
1737
1738 if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
1739 return std::nullopt;
1740
1741 unsigned Width = cast<IntegerType>(Ty->getScalarType())->getBitWidth();
1742
1743 FxpOp Op;
1744 Value *Exp = &In;
1745
1746 // Fixed-point multiplication is always shifted right (except when the
1747 // fraction is 0 bits).
1748 auto m_Shr = [](auto &&V, auto &&S) {
1749 return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
1750 };
1751
1752 uint64_t Qn = 0;
1753 if (Value *T; match(Exp, m_Shr(m_Value(T), m_ConstantInt(Qn)))) {
1754 Op.Frac = Qn;
1755 Exp = T;
1756 } else {
1757 Op.Frac = 0;
1758 }
1759
1760 if (Op.Frac > Width)
1761 return std::nullopt;
1762
1763 // Check if there is rounding added.
1764 uint64_t CV;
1765 if (Value *T;
1766 Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_ConstantInt(CV)))) {
1767 if (CV != 0 && !isPowerOf2_64(CV))
1768 return std::nullopt;
1769 if (CV != 0)
1770 Op.RoundAt = Log2_64(CV);
1771 Exp = T;
1772 }
1773
1774 // Check if the rest is a multiplication.
1775 if (match(Exp, m_Mul(m_Value(Op.X.Val), m_Value(Op.Y.Val)))) {
1776 Op.Opcode = Instruction::Mul;
1777 // FIXME: The information below is recomputed.
1778 Op.X.Sgn = getNumSignificantBits(Op.X.Val, &In).second;
1779 Op.Y.Sgn = getNumSignificantBits(Op.Y.Val, &In).second;
1780 Op.ResTy = cast<VectorType>(Ty);
1781 return Op;
1782 }
1783
1784 return std::nullopt;
1785}
1786
1787auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
1788 -> Value * {
1789 assert(Op.X.Val->getType() == Op.Y.Val->getType());
1790
1791 auto *VecTy = dyn_cast<VectorType>(Op.X.Val->getType());
1792 if (VecTy == nullptr)
1793 return nullptr;
1794 auto *ElemTy = cast<IntegerType>(VecTy->getElementType());
1795 unsigned ElemWidth = ElemTy->getBitWidth();
1796
1797 // TODO: This can be relaxed after legalization is done pre-isel.
1798 if ((HVC.length(VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
1799 return nullptr;
1800
1801 // There are no special intrinsics that should be used for multiplying
1802 // signed 8-bit values, so just skip them. Normal codegen should handle
1803 // this just fine.
1804 if (ElemWidth <= 8)
1805 return nullptr;
1806 // Similarly, if this is just a multiplication that can be handled without
1807 // intervention, then leave it alone.
1808 if (ElemWidth <= 32 && Op.Frac == 0)
1809 return nullptr;
1810
1811 auto [BitsX, SignX] = getNumSignificantBits(Op.X.Val, &In);
1812 auto [BitsY, SignY] = getNumSignificantBits(Op.Y.Val, &In);
1813
1814 // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).
1815
1816 Value *X = Op.X.Val, *Y = Op.Y.Val;
1817 IRBuilder Builder(In.getParent(), In.getIterator(),
1818 InstSimplifyFolder(HVC.DL));
1819
1820 auto roundUpWidth = [](unsigned Width) -> unsigned {
1821 if (Width <= 32 && !isPowerOf2_32(Width)) {
1822 // If the element width is not a power of 2, round it up
1823 // to the next one. Do this for widths not exceeding 32.
1824 return PowerOf2Ceil(Width);
1825 }
1826 if (Width > 32 && Width % 32 != 0) {
1827 // For wider elements, round it up to the multiple of 32.
1828 return alignTo(Width, 32u);
1829 }
1830 return Width;
1831 };
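  // For example: roundUpWidth(17) == 32, roundUpWidth(24) == 32, and
  // roundUpWidth(40) == 64, while already-conforming widths such as 16 or 64
  // are returned unchanged.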
1832
1833 BitsX = roundUpWidth(BitsX);
1834 BitsY = roundUpWidth(BitsY);
1835
1836 // For elementwise multiplication vectors must have the same lengths, so
1837 // resize the elements of both inputs to the same width, the max of the
1838 // calculated significant bits.
1839 unsigned Width = std::max(BitsX, BitsY);
1840
1841 auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);
1842 if (Width < ElemWidth) {
1843 X = Builder.CreateTrunc(X, ResizeTy, "trn");
1844 Y = Builder.CreateTrunc(Y, ResizeTy, "trn");
1845 } else if (Width > ElemWidth) {
1846 X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy, "sxt")
1847 : Builder.CreateZExt(X, ResizeTy, "zxt");
1848 Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy, "sxt")
1849 : Builder.CreateZExt(Y, ResizeTy, "zxt");
1850 }
1851
1852 assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1853
1854 unsigned VecLen = HVC.length(ResizeTy);
1855 unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(Width, 32u);
1856
1857 SmallVector<Value *> Results;
1858 FxpOp ChopOp = Op;
1859 ChopOp.ResTy = VectorType::get(Op.ResTy->getElementType(), ChopLen, false);
1860
1861 for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1862 ChopOp.X.Val = HVC.subvector(Builder, X, V * ChopLen, ChopLen);
1863 ChopOp.Y.Val = HVC.subvector(Builder, Y, V * ChopLen, ChopLen);
1864 Results.push_back(processFxpMulChopped(Builder, In, ChopOp));
1865 if (Results.back() == nullptr)
1866 break;
1867 }
1868
1869 if (Results.empty() || Results.back() == nullptr)
1870 return nullptr;
1871
1872 Value *Cat = HVC.concat(Builder, Results);
1873 Value *Ext = SignX == Signed || SignY == Signed
1874 ? Builder.CreateSExt(Cat, VecTy, "sxt")
1875 : Builder.CreateZExt(Cat, VecTy, "zxt");
1876 return Ext;
1877}
1878
1879inline bool HvxIdioms::matchScatter(Instruction &In) const {
1880 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1881 if (!II)
1882 return false;
1883 return (II->getIntrinsicID() == Intrinsic::masked_scatter);
1884}
1885
1886inline bool HvxIdioms::matchGather(Instruction &In) const {
1887 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1888 if (!II)
1889 return false;
1890 return (II->getIntrinsicID() == Intrinsic::masked_gather);
1891}
1892
1893inline bool HvxIdioms::matchMLoad(Instruction &In) const {
1894 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1895 if (!II)
1896 return false;
1897 return (II->getIntrinsicID() == Intrinsic::masked_load);
1898}
1899
1900inline bool HvxIdioms::matchMStore(Instruction &In) const {
1901 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1902 if (!II)
1903 return false;
1904 return (II->getIntrinsicID() == Intrinsic::masked_store);
1905}
1906
1907Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
1908
1909// Binary instructions we want to handle as users of gather/scatter.
1910inline bool isArithmetic(unsigned Opc) {
1911 switch (Opc) {
1912 case Instruction::Add:
1913 case Instruction::Sub:
1914 case Instruction::Mul:
1915 case Instruction::And:
1916 case Instruction::Or:
1917 case Instruction::Xor:
1918 case Instruction::AShr:
1919 case Instruction::LShr:
1920 case Instruction::Shl:
1921 case Instruction::UDiv:
1922 return true;
1923 }
1924 return false;
1925}
1926
1927// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
1928inline Value *getPointer(Value *Ptr) {
1929 assert(Ptr && "Unable to extract pointer");
1930 if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr))
1931 return Ptr;
1932 if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr))
1933 return getLoadStorePointerOperand(Ptr);
1934 if (auto *II = dyn_cast<IntrinsicInst>(Ptr)) {
1935 if (II->getIntrinsicID() == Intrinsic::masked_store)
1936 return II->getOperand(1);
1937 }
1938 return nullptr;
1939}
1940
1941 Instruction *selectDestination(Instruction *In,
1942 HvxIdioms::DstQualifier &Qual) {
1943 Instruction *Destination = nullptr;
1944 if (!In)
1945 return Destination;
1946 if (isa<StoreInst>(In)) {
1947 Destination = In;
1948 Qual = HvxIdioms::LdSt;
1949 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
1950 if (II->getIntrinsicID() == Intrinsic::masked_gather) {
1951 Destination = In;
1952 Qual = HvxIdioms::LLVM_Gather;
1953 } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
1954 Destination = In;
1955 Qual = HvxIdioms::LLVM_Scatter;
1956 } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
1957 Destination = In;
1958 Qual = HvxIdioms::LdSt;
1959 } else if (II->getIntrinsicID() ==
1960 Intrinsic::hexagon_V6_vgather_vscattermh) {
1961 Destination = In;
1962 Qual = HvxIdioms::HEX_Gather_Scatter;
1963 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
1964 Destination = In;
1965 Qual = HvxIdioms::HEX_Scatter;
1966 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
1967 Destination = In;
1968 Qual = HvxIdioms::HEX_Gather;
1969 }
1970 } else if (isa<ZExtInst>(In)) {
1971 return locateDestination(In, Qual);
1972 } else if (isa<CastInst>(In)) {
1973 return locateDestination(In, Qual);
1974 } else if (isa<CallInst>(In)) {
1975 Destination = In;
1976 Qual = HvxIdioms::Call;
1977 } else if (isa<GetElementPtrInst>(In)) {
1978 return locateDestination(In, Qual);
1979 } else if (isArithmetic(In->getOpcode())) {
1980 Destination = In;
1981 Qual = HvxIdioms::Arithmetic;
1982 } else {
1983 LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
1984 }
1985 return Destination;
1986}
1987
1988 // This method attempts to find the destination (user) of a given intrinsic.
1989 // Given that these are produced only by Ripple, the number of options is
1990 // limited. The simplest case is an explicit store, which is in fact redundant
1991 // (since the HVX gather creates its own store during packetization), but we
1992 // still need to figure out the address we are storing to. The other cases are
1993 // more complicated, but still few.
1994Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
1995 Instruction *Destination = nullptr;
1996 if (!In)
1997 return Destination;
1998 // Get all possible destinations
1999 SmallVector<Instruction *> Users;
2000 // Iterate over the uses of the instruction.
2001 for (auto &U : In->uses()) {
2002 if (auto *UI = dyn_cast<Instruction>(U.getUser())) {
2003 Destination = selectDestination(UI, Qual);
2004 if (Destination)
2005 Users.push_back(Destination);
2006 }
2007 }
2008 // Now see which of the users (if any) is a memory destination.
2009 for (auto *I : Users)
2010 if (getPointer(I))
2011 return I;
2012 return Destination;
2013}
2014
2015// The two intrinsics we handle here have GEP in a different position.
2016 GetElementPtrInst *locateGEPFromIntrinsic(Instruction *In) {
2017 assert(In && "Bad instruction");
2018 IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(In);
2019 assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
2020 IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
2021 "Not a gather Intrinsic");
2022 GetElementPtrInst *GEPIndex = nullptr;
2023 if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
2024 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0));
2025 else
2026 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1));
2027 return GEPIndex;
2028}
2029
2030// Given the intrinsic find its GEP argument and extract base address it uses.
2031// The method relies on the way how Ripple typically forms the GEP for
2032// scatter/gather.
2033 Value *locateAddressFromIntrinsic(Instruction *In) {
2034 GetElementPtrInst *GEPIndex = locateGEPFromIntrinsic(In);
2035 if (!GEPIndex) {
2036 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2037 return nullptr;
2038 }
2039 Value *BaseAddress = GEPIndex->getPointerOperand();
2040 auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress);
2041 if (IndexLoad)
2042 return IndexLoad;
2043
2044 auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress);
2045 if (IndexZEx) {
2046 IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0));
2047 if (IndexLoad)
2048 return IndexLoad;
2049 IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0));
2050 if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
2051 return II;
2052 }
2053 auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress);
2054 if (BaseShuffle) {
2055 IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0));
2056 if (IndexLoad)
2057 return IndexLoad;
2058 auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0));
2059 if (IE) {
2060 auto *Src = IE->getOperand(1);
2061 IndexLoad = dyn_cast<LoadInst>(Src);
2062 if (IndexLoad)
2063 return IndexLoad;
2064 auto *Alloca = dyn_cast<AllocaInst>(Src);
2065 if (Alloca)
2066 return Alloca;
2067 if (isa<Argument>(Src)) {
2068 return Src;
2069 }
2070 if (isa<GlobalValue>(Src)) {
2071 return Src;
2072 }
2073 }
2074 }
2075 LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n");
2076 return nullptr;
2077}
2078
2079 Type *getIndexType(Value *In) {
2080 if (!In)
2081 return nullptr;
2082
2083 if (isa<LoadInst>(In) || isa<StoreInst>(In))
2084 return getLoadStoreType(In);
2085
2086 if (auto *II = dyn_cast<IntrinsicInst>(In)) {
2087 if (II->getIntrinsicID() == Intrinsic::masked_load)
2088 return II->getType();
2089 if (II->getIntrinsicID() == Intrinsic::masked_store)
2090 return II->getOperand(0)->getType();
2091 }
2092 return In->getType();
2093}
2094
2095 Value *locateIndexesFromGEP(Value *In) {
2096 if (!In)
2097 return nullptr;
2098 if (isa<LoadInst>(In))
2099 return In;
2100 if (auto *II = dyn_cast<IntrinsicInst>(In)) {
2101 if (II->getIntrinsicID() == Intrinsic::masked_load)
2102 return In;
2103 if (II->getIntrinsicID() == Intrinsic::masked_gather)
2104 return In;
2105 }
2106 if (auto *IndexZEx = dyn_cast<ZExtInst>(In))
2107 return locateIndexesFromGEP(IndexZEx->getOperand(0));
2108 if (auto *IndexSEx = dyn_cast<SExtInst>(In))
2109 return locateIndexesFromGEP(IndexSEx->getOperand(0));
2110 if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))
2111 return locateIndexesFromGEP(BaseShuffle->getOperand(0));
2112 if (auto *IE = dyn_cast<InsertElementInst>(In))
2113 return locateIndexesFromGEP(IE->getOperand(1));
2114 if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In))
2115 return cstDataVector;
2116 if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))
2117 return GEPIndex->getOperand(0);
2118 return nullptr;
2119}
2120
2121 // Given the intrinsic, find its GEP argument and extract the offsets from the
2122 // base address it uses.
2123 Value *locateIndexesFromIntrinsic(Instruction *In) {
2124 GetElementPtrInst *GEPIndex = locateGEPFromIntrinsic(In);
2125 if (!GEPIndex) {
2126 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2127 return nullptr;
2128 }
2129 Value *Indexes = GEPIndex->getOperand(1);
2130 if (auto *IndexLoad = locateIndexesFromGEP(Indexes))
2131 return IndexLoad;
2132
2133 LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n");
2134 return nullptr;
2135}
2136
2137 // Because of the awkward definition of many Hexagon intrinsics we often have
2138 // to reinterpret an HVX-native <64 x i16> as <32 x i32>, which in practice is
2139 // a NOP for all use cases, so this only exists to make the IR builder happy.
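// Conceptually, the helper below emits roughly the following IR (names as
// passed to the builder calls; the InstSimplify folder may fold the identity
// shuffle away):
//   %identity_shuffle = shufflevector <64 x i16> %I, <64 x i16> %I,
//                                      <64 x i32> <i32 0, i32 1, ..., i32 63>
//   %cst64_i16_to_32_i32 = bitcast <64 x i16> %identity_shuffle to <32 x i32>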
2140inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
2141 IRBuilderBase &Builder,
2142 LLVMContext &Ctx, Value *I) {
2143 assert(I && "Unable to reinterpret cast");
2144 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2145 std::vector<unsigned> shuffleMask;
2146 for (unsigned i = 0; i < 64; ++i)
2147 shuffleMask.push_back(i);
2148 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2149 Value *CastShuffle =
2150 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2151 return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32");
2152}
2153
2154// Recast <128 x i8> as <32 x i32>
2155inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
2156 IRBuilderBase &Builder,
2157 LLVMContext &Ctx, Value *I) {
2158 assert(I && "Unable to reinterpret cast");
2159 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2160 std::vector<unsigned> shuffleMask;
2161 for (unsigned i = 0; i < 128; ++i)
2162 shuffleMask.push_back(i);
2163 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2164 Value *CastShuffle =
2165 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2166 return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32");
2167}
2168
2169// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern
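// Roughly speaking, with pattern == 0x00ff00ff (the value used by the i8
// gather/scatter paths below) every word of the splatted vector has two
// non-zero and two zero bytes, so the V6_vandvrt in the helper yields a
// <128 x i1> predicate that selects every other byte lane, i.e. one byte of
// each halfword.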
2170inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
2171 IRBuilderBase &Builder, LLVMContext &Ctx,
2172 unsigned int pattern) {
2173 std::vector<unsigned int> byteMask;
2174 for (unsigned i = 0; i < 32; ++i)
2175 byteMask.push_back(pattern);
2176
2177 return Builder.CreateIntrinsic(
2178 HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt),
2179 {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)},
2180 nullptr);
2181}
2182
2183Value *HvxIdioms::processVScatter(Instruction &In) const {
2184 auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());
2185 assert(InpTy && "Cannot handle non-vector type for llvm.scatter/gather");
2186 unsigned InpSize = HVC.getSizeOf(InpTy);
2187 auto *F = In.getFunction();
2188 LLVMContext &Ctx = F->getContext();
2189 auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType());
2190 assert(ElemTy && "llvm.scatter needs integer type argument");
2191 unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy);
2192 LLVM_DEBUG({
2193 unsigned Elements = HVC.length(InpTy);
2194 dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";
2195 dbgs() << " Input type(" << *InpTy << ") elements(" << Elements
2196 << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("
2197 << ElemWidth << ")\n";
2198 });
2199
2200 IRBuilder Builder(In.getParent(), In.getIterator(),
2201 InstSimplifyFolder(HVC.DL));
2202
2203 auto *ValueToScatter = In.getOperand(0);
2204 LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n");
2205
2206 if (HVC.HST.getVectorLength() != InpSize) {
2207 LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
2208 << ") for vscatter\n");
2209 return nullptr;
2210 }
2211
2212 // Base address of indexes.
2213 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2214 if (!IndexLoad)
2215 return nullptr;
2216 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2217
2218 // Address of destination. Must be in VTCM.
2219 auto *Ptr = getPointer(IndexLoad);
2220 if (!Ptr)
2221 return nullptr;
2222 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2223 // Indexes/offsets
2224 auto *Indexes = locateIndexesFromIntrinsic(&In);
2225 if (!Indexes)
2226 return nullptr;
2227 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2228 Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx),
2229 "cst_ptr_to_i32");
2230 LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n");
2231 // Adjust Indexes
2232 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2233 Value *CastIndex = nullptr;
2234 if (cstDataVector) {
2235 // Our indexes are represented as a constant. We need it in a reg.
2236 AllocaInst *IndexesAlloca =
2237 Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false));
2238 [[maybe_unused]] auto *StoreIndexes =
2239 Builder.CreateStore(cstDataVector, IndexesAlloca);
2240 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2241 CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(),
2242 IndexesAlloca, "reload_index");
2243 } else {
2244 if (ElemWidth == 2)
2245 CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2246 else
2247 CastIndex = Indexes;
2248 }
2249 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2250
2251 if (ElemWidth == 1) {
2252 // v128i8 There is no native instruction for this.
2253 // Do this as two Hi/Lo gathers with masking.
2254 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2255 // Extend indexes. We assume that indexes are in 128i8 format - need to
2256 // expand them to Hi/Lo 64i16
2257 Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32");
2258 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2259 auto *UnpackedIndexes = Builder.CreateIntrinsic(
2260 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr);
2261 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n");
2262
2263 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2264 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2265 [[maybe_unused]] Value *IndexHi =
2266 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2267 [[maybe_unused]] Value *IndexLo =
2268 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2269 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2270 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2271 // Now unpack values to scatter
2272 Value *CastSrc =
2273 getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter);
2274 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2275 auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
2276 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr);
2277 LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter
2278 << ")\n");
2279
2280 [[maybe_unused]] Value *UVSHi =
2281 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter);
2282 [[maybe_unused]] Value *UVSLo =
2283 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter);
2284 LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n");
2285 LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n");
2286
2287 // Create the mask for individual bytes
2288 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2289 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2290 [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(
2291 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2292 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2293 IndexHi, UVSHi},
2294 nullptr);
2295 LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n");
2296 return Builder.CreateIntrinsic(
2297 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2298 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2299 IndexLo, UVSLo},
2300 nullptr);
2301 } else if (ElemWidth == 2) {
2302 Value *CastSrc =
2303 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter);
2304 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2305 return Builder.CreateIntrinsic(
2306 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B,
2307 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2308 CastSrc},
2309 nullptr);
2310 } else if (ElemWidth == 4) {
2311 return Builder.CreateIntrinsic(
2312 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B,
2313 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2314 ValueToScatter},
2315 nullptr);
2316 } else {
2317 LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
2318 return nullptr;
2319 }
2320}
2321
2322Value *HvxIdioms::processVGather(Instruction &In) const {
2323 [[maybe_unused]] auto *InpTy =
2324 dyn_cast<VectorType>(In.getOperand(0)->getType());
2325 assert(InpTy && "Cannot handle non-vector type for llvm.gather");
2326 [[maybe_unused]] auto *ElemTy =
2327 dyn_cast<PointerType>(InpTy->getElementType());
2328 assert(ElemTy && "llvm.gather needs vector of ptr argument");
2329 auto *F = In.getFunction();
2330 LLVMContext &Ctx = F->getContext();
2331 LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
2332 << *In.getParent() << "\n");
2333 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2334 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2335 << ") type(" << *ElemTy << ") Access alignment("
2336 << *In.getOperand(1) << ") AddressSpace("
2337 << ElemTy->getAddressSpace() << ")\n");
2338
2339 // TODO: Handle masking of elements.
2340 assert(isa<VectorType>(In.getOperand(2)->getType()) &&
2341 "llvm.gather needs vector for mask");
2342 IRBuilder Builder(In.getParent(), In.getIterator(),
2343 InstSimplifyFolder(HVC.DL));
2344
2345 // See who is using the result. The difference between LLVM and HVX vgather
2346 // Intrinsic makes it impossible to handle all cases with temp storage. Alloca
2347 // in VTCM is not yet supported, so for now we just bail out for those cases.
2348 HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
2349 Instruction *Dst = locateDestination(&In, Qual);
2350 if (!Dst) {
2351 LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n");
2352 return nullptr;
2353 }
2354 LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual
2355 << ")\n");
2356
2357 // Address of destination. Must be in VTCM.
2358 auto *Ptr = getPointer(Dst);
2359 if (!Ptr) {
2360 LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
2361 return nullptr;
2362 }
2363
2364 // Result type. Assume it is a vector type.
2365 auto *DstType = cast<VectorType>(getIndexType(Dst));
2366 assert(DstType && "Cannot handle non vector dst type for llvm.gather");
2367
2368 // Base address for sources to be loaded
2369 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2370 if (!IndexLoad)
2371 return nullptr;
2372 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2373
2374 // Gather indexes/offsets
2375 auto *Indexes = locateIndexesFromIntrinsic(&In);
2376 if (!Indexes)
2377 return nullptr;
2378 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2379
2380 Instruction *Gather = nullptr;
2381 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2382 if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
2383 // We fully assume the address space is in VTCM. We also assume that all
2384 // pointers in Operand(0) have the same base(!).
2385 // This is the most basic case of all the above.
2386 unsigned OutputSize = HVC.getSizeOf(DstType);
2387 auto *DstElemTy = cast<IntegerType>(DstType->getElementType());
2388 unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy);
2389 LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType()
2390 << " Address space ("
2391 << Ptr->getType()->getPointerAddressSpace() << ")\n"
2392 << " Result type : " << *DstType
2393 << "\n Size in bytes : " << OutputSize
2394 << " element type(" << *DstElemTy
2395 << ")\n ElemWidth : " << ElemWidth << " bytes\n");
2396
2397 auto *IndexType = cast<VectorType>(getIndexType(Indexes));
2398 assert(IndexType && "Cannot handle non vector index type for llvm.gather");
2399 unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType());
2400 LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n");
2401
2402 // Intrinsic takes i32 instead of pointer so cast.
2403 Value *CastedPtr = Builder.CreateBitOrPointerCast(
2404 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2405 // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
2406 // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty]
2407 // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty]
2408 // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty]
2409 // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
2410 // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty]
2411 // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty]
2412 if (HVC.HST.getVectorLength() == OutputSize) {
2413 if (ElemWidth == 1) {
2414 // v128i8 There is no native instruction for this.
2415 // Do this as two Hi/Lo gathers with masking.
2416 // Unpack indexes. We assume that indexes are in 128i8 format - need to
2417 // expand them to Hi/Lo 64i16
2418 Value *CastIndexes =
2419 Builder.CreateBitCast(Indexes, NT, "cast_to_32i32");
2420 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2421 auto *UnpackedIndexes =
2422 Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true),
2423 V6_vunpack, CastIndexes, nullptr);
2424 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes
2425 << ")\n");
2426
2427 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2428 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2429 [[maybe_unused]] Value *IndexHi =
2430 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2431 [[maybe_unused]] Value *IndexLo =
2432 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2433 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2434 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2435 // Create the mask for individual bytes
2436 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2437 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2438 // We use our destination allocation as a temp storage
2439 // This is unlikely to work properly for masked gather.
2440 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq);
2441 [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(
2442 Type::getVoidTy(Ctx), V6_vgather,
2443 {Ptr, QByteMask, CastedPtr,
2444 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
2445 nullptr);
2446 LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n");
2447 // Rematerialize the result
2448 [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(
2449 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi");
2450 LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n");
2451 // Same for the low part. Here we use Gather to return non-NULL result
2452 // from this function and continue to iterate. We also are deleting Dst
2453 // store below.
2454 Gather = Builder.CreateIntrinsic(
2455 Type::getVoidTy(Ctx), V6_vgather,
2456 {Ptr, QByteMask, CastedPtr,
2457 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
2458 nullptr);
2459 LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n");
2460 Value *LoadedResultLo = Builder.CreateLoad(
2461 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo");
2462 LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n");
2463 // Now we have properly sized bytes in every other position
2464 // B b A a c a A b B c f F g G h H is presented as
2465 // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
2466 // Use vpack to gather them
2467 auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb);
2468 [[maybe_unused]] auto Res = Builder.CreateIntrinsic(
2469 NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr);
2470 LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n");
2471 [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr);
2472 LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n");
2473 } else if (ElemWidth == 2) {
2474 // v32i16
2475 if (IndexWidth == 2) {
2476 // Reinterprete 64i16 as 32i32. Only needed for syntactic IR match.
2477 Value *CastIndex =
2478 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2479 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2480 // shift all i16 left by 1 to match short addressing mode instead of
2481 // byte.
2482 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2483 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2484 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2485 LLVM_DEBUG(dbgs()
2486 << " Shifted half index: " << *AdjustedIndex << ")\n");
2487
2488 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh);
2489 // The 3rd argument is the size of the region to gather from. Probably
2490 // want to set it to max VTCM size.
2491 Gather = Builder.CreateIntrinsic(
2492 Type::getVoidTy(Ctx), V6_vgather,
2493 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2494 AdjustedIndex},
2495 nullptr);
2496 for (auto &U : Dst->uses()) {
2497 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2498 LLVM_DEBUG(dbgs() << " dst used by: " << *UI << "\n");
2499 }
2500 for (auto &U : In.uses()) {
2501 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2502 LLVM_DEBUG(dbgs() << " In used by : " << *UI << "\n");
2503 }
2504 // Create temp load from result in case the result is used by any
2505 // other instruction.
2506 Value *LoadedResult = Builder.CreateLoad(
2507 HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result");
2508 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2509 In.replaceAllUsesWith(LoadedResult);
2510 } else {
2511 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2512 return nullptr;
2513 }
2514 } else if (ElemWidth == 4) {
2515 if (IndexWidth == 4) {
2516 // v32i32
2517 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2518 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2519 Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)});
2520 LLVM_DEBUG(dbgs()
2521 << " Shifted word index: " << *AdjustedIndex << ")\n");
2522 Gather = Builder.CreateIntrinsic(
2523 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B,
2524 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2525 AdjustedIndex},
2526 nullptr);
2527 } else {
2528 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2529 return nullptr;
2530 }
2531 } else {
2532 LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n");
2533 return nullptr;
2534 }
2535 } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
2536 // This is half of the reg width, duplicate low in high
2537 LLVM_DEBUG(dbgs() << " Unhandled half of register size\n");
2538 return nullptr;
2539 } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
2540 LLVM_DEBUG(dbgs() << " Unhandled: twice the register size\n");
2541 return nullptr;
2542 }
2543 // Erase the original intrinsic and store that consumes it.
2544 // HVX will create a pseudo for gather that is expanded to gather + store
2545 // during packetization.
2546 Dst->eraseFromParent();
2547 } else if (Qual == HvxIdioms::LLVM_Scatter) {
2548 // Gather feeds directly into scatter.
2549 LLVM_DEBUG({
2550 auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType());
2551 assert(DstInpTy && "Cannot handle non-vector type for llvm.scatter");
2552 unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
2553 unsigned DstElements = HVC.length(DstInpTy);
2554 auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType());
2555 assert(DstElemTy && "llvm.scatter needs vector of ptr argument");
2556 dbgs() << " Gather feeds into scatter\n Values to scatter : "
2557 << *Dst->getOperand(0) << "\n";
2558 dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements
2559 << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy
2560 << ") Access alignment(" << *Dst->getOperand(2) << ")\n";
2561 });
2562 // Address of source
2563 auto *Src = getPointer(IndexLoad);
2564 if (!Src)
2565 return nullptr;
2566 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2567
2568 if (!isa<PointerType>(Src->getType())) {
2569 LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n");
2570 return nullptr;
2571 }
2572
2573 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2574 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2575 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2576
2577 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2578 if (!DstLoad) {
2579 LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n");
2580 return nullptr;
2581 }
2582 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2583
2584 Value *Ptr = getPointer(DstLoad);
2585 if (!Ptr)
2586 return nullptr;
2587 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2588 Value *CastIndex =
2589 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad);
2590 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2591 // Shift all i16 left by 1 to match short addressing mode instead of
2592 // byte.
2593 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2594 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2595 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2596 LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n");
2597
2598 return Builder.CreateIntrinsic(
2599 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2600 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2601 AdjustedIndex},
2602 nullptr);
2603 } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
2604 // Gather feeds into previously inserted pseudo intrinsic.
2605 // These could not be in the same packet, so we need to generate another
2606 // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo
2607 // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,
2608 // ModRegs:$Mu, HvxVR:$Vv)
2609 if (isa<AllocaInst>(IndexLoad)) {
2610 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2611 if (cstDataVector) {
2612 // Our indexes are represented as a constant. We need them in a reg.
2613 // This most likely will not work properly, since alloca gives us a DDR
2614 // stack location. This will be fixed once we teach the compiler about VTCM.
2615 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2616 [[maybe_unused]] auto *StoreIndexes =
2617 Builder.CreateStore(cstDataVector, IndexesAlloca);
2618 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2619 Value *LoadedIndex = Builder.CreateLoad(
2620 IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
2621 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2622 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n");
2623
2624 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2625 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2626 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2627
2628 Gather = Builder.CreateIntrinsic(
2629 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2630 {ResultAlloca, CastedSrc,
2631 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2632 nullptr);
2633 Value *LoadedResult = Builder.CreateLoad(
2634 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2635 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2636 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2637 In.replaceAllUsesWith(LoadedResult);
2638 }
2639 } else {
2640 // Address of source
2641 auto *Src = getPointer(IndexLoad);
2642 if (!Src)
2643 return nullptr;
2644 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2645
2646 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2647 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2648 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2649
2650 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2651 if (!DstLoad)
2652 return nullptr;
2653 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2654 auto *Ptr = getPointer(DstLoad);
2655 if (!Ptr)
2656 return nullptr;
2657 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2658
2659 Gather = Builder.CreateIntrinsic(
2660 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh,
2661 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2662 Indexes},
2663 nullptr);
2664 }
2665 return Gather;
2666 } else if (Qual == HvxIdioms::HEX_Scatter) {
2667 // This is the case when result of a gather is used as an argument to
2668 // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it
2669 // ourselves. We have to create alloca, store to it, and replace all uses
2670 // with that.
2671 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2672 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2673 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2674 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2675 Value *CastIndex =
2676 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2677 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2678
2679 Gather = Builder.CreateIntrinsic(
2680 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2681 {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2682 CastIndex},
2683 nullptr);
2684 Value *LoadedResult = Builder.CreateLoad(
2685 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2686 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2687 In.replaceAllUsesWith(LoadedResult);
2688 } else if (Qual == HvxIdioms::HEX_Gather) {
2689 // Gather feeds to another gather but already replaced with
2690 // hexagon_V6_vgathermh_128B
2691 if (isa<AllocaInst>(IndexLoad)) {
2692 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2693 if (cstDataVector) {
2694 // Our indexes are represented as a constant. We need it in a reg.
2695 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2696
2697 [[maybe_unused]] auto *StoreIndexes =
2698 Builder.CreateStore(cstDataVector, IndexesAlloca);
2699 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2700 Value *LoadedIndex = Builder.CreateLoad(
2701 IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
2702 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2703 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca
2704 << "\n AddressSpace: "
2705 << ResultAlloca->getAddressSpace() << "\n";);
2706
2707 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2708 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2709 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2710
2711 Gather = Builder.CreateIntrinsic(
2712 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2713 {ResultAlloca, CastedSrc,
2714 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2715 nullptr);
2716 Value *LoadedResult = Builder.CreateLoad(
2717 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2718 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2719 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2720 In.replaceAllUsesWith(LoadedResult);
2721 }
2722 }
2723 } else if (Qual == HvxIdioms::LLVM_Gather) {
2724 // Gather feeds into another gather
2725 errs() << " Unimplemented vgather-to-vgather sequence\n";
2726 return nullptr;
2727 } else
2728 llvm_unreachable("Unhandled Qual enum");
2729
2730 return Gather;
2731}
2732
2733 // Go through all PHI incoming values and find the minimal alignment for the
2734 // non-GEP members.
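// As an illustration (hypothetical IR): for a pointer recurrence
//   %p = phi ptr [ %base, %entry ], [ %p.next, %loop ]
// where %p.next is a GEP and %base is known to be 128-byte aligned, the GEP
// arm is skipped and the function returns 128.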
2735std::optional<uint64_t> HvxIdioms::getPHIBaseMinAlignment(Instruction &In,
2736 PHINode *PN) const {
2737 if (!PN)
2738 return std::nullopt;
2739
2740 SmallVector<Value *, 16> Worklist;
2741 SmallPtrSet<Value *, 16> Visited;
2742 uint64_t minPHIAlignment = Value::MaximumAlignment;
2743 Worklist.push_back(PN);
2744
2745 while (!Worklist.empty()) {
2746 Value *V = Worklist.back();
2747 Worklist.pop_back();
2748 if (!Visited.insert(V).second)
2749 continue;
2750
2751 if (PHINode *PN = dyn_cast<PHINode>(V)) {
2752 for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) {
2753 Worklist.push_back(PN->getIncomingValue(i));
2754 }
2755 } else if (isa<GetElementPtrInst>(V)) {
2756 // Ignore geps for now.
2757 continue;
2758 } else {
2759 Align KnownAlign = getKnownAlignment(V, HVC.DL, &In, &HVC.AC, &HVC.DT);
2760 if (KnownAlign.value() < minPHIAlignment)
2761 minPHIAlignment = KnownAlign.value();
2762 }
2763 }
2764 if (minPHIAlignment != Value::MaximumAlignment)
2765 return minPHIAlignment;
2766 return std::nullopt;
2767}
2768
2769// Helper function to discover alignment for a ptr.
2770std::optional<uint64_t> HvxIdioms::getAlignment(Instruction &In,
2771 Value *ptr) const {
2772 SmallPtrSet<Value *, 16> Visited;
2773 return getAlignmentImpl(In, ptr, Visited);
2774}
2775
2776std::optional<uint64_t>
2777HvxIdioms::getAlignmentImpl(Instruction &In, Value *ptr,
2778 SmallPtrSet<Value *, 16> &Visited) const {
2779 LLVM_DEBUG(dbgs() << "[getAlignment] for : " << *ptr << "\n");
2780 // Prevent infinite recursion
2781 if (!Visited.insert(ptr).second)
2782 return std::nullopt;
2783 // Try AssumptionCache.
2784 Align KnownAlign = getKnownAlignment(ptr, HVC.DL, &In, &HVC.AC, &HVC.DT);
2785 // This is the most formal and reliable source of information.
2786 if (KnownAlign.value() > 1) {
2787 LLVM_DEBUG(dbgs() << " VC align(" << KnownAlign.value() << ")\n");
2788 return KnownAlign.value();
2789 }
2790
2791 // If it is a PHI try to iterate through inputs
2792 if (PHINode *PN = dyn_cast<PHINode>(ptr)) {
2793 // See if we have a common base to which we know alignment.
2794 auto baseAlignmentOpt = getPHIBaseMinAlignment(In, PN);
2795 if (!baseAlignmentOpt)
2796 return std::nullopt;
2797
2798 uint64_t minBaseAlignment = *baseAlignmentOpt;
2799 // If it is 1, there is no point to keep on looking.
2800 if (minBaseAlignment == 1)
2801 return 1;
2802 // Now see if all other incoming phi values are just loop-carried constants.
2803 uint64_t minPHIAlignment = minBaseAlignment;
2804 LLVM_DEBUG(dbgs() << " It is a PHI with(" << PN->getNumIncomingValues()
2805 << ")nodes and min base aligned to (" << minBaseAlignment
2806 << ")\n");
2807 for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) {
2808 Value *IV = PN->getIncomingValue(i);
2809 // We have already looked at all other values.
2810 if (!isa<GetElementPtrInst>(IV))
2811 continue;
2812 uint64_t MemberAlignment = Value::MaximumAlignment;
2813 if (auto res = getAlignment(*PN, IV))
2814 MemberAlignment = *res;
2815 else
2816 return std::nullopt;
2817 // Adjust total PHI alignment.
2818 if (minPHIAlignment > MemberAlignment)
2819 minPHIAlignment = MemberAlignment;
2820 }
2821 LLVM_DEBUG(dbgs() << " total PHI alignment(" << minPHIAlignment << ")\n");
2822 return minPHIAlignment;
2823 }
2824
2825 if (auto *GEP = dyn_cast<GetElementPtrInst>(ptr)) {
2826 auto *GEPPtr = GEP->getPointerOperand();
2827 // Only if this is the induction variable with const offset
2828 // Implicit assumption is that induction variable itself is a PHI
2829 if (&In == GEPPtr) {
2830 APInt Offset(HVC.DL.getPointerSizeInBits(
2831 GEPPtr->getType()->getPointerAddressSpace()),
2832 0);
2833 if (GEP->accumulateConstantOffset(HVC.DL, Offset)) {
2834 LLVM_DEBUG(dbgs() << " Induction GEP with const step of ("
2835 << Offset.getZExtValue() << ")\n");
2836 return Offset.getZExtValue();
2837 }
2838 }
2839 }
2840
2841 return std::nullopt;
2842}
2843
2844Value *HvxIdioms::processMStore(Instruction &In) const {
2845 [[maybe_unused]] auto *InpTy =
2846 dyn_cast<VectorType>(In.getOperand(0)->getType());
2847 assert(InpTy && "Cannot handle non-vector type for llvm.masked.store");
2848
2849 LLVM_DEBUG(dbgs() << "\n[Process mstore](" << In << ")\n"
2850 << *In.getParent() << "\n");
2851 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2852 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2853 << ") type(" << *InpTy->getElementType() << ") of size("
2854 << InpTy->getScalarSizeInBits() << ")bits\n");
2855 auto *CI = dyn_cast<CallBase>(&In);
2856 assert(CI && "Expected llvm.masked.store to be a call");
2857 Align HaveAlign = CI->getParamAlign(1).valueOrOne();
2858
2859 uint64_t KA = 1;
2860 if (auto res = getAlignment(In, In.getOperand(1))) // ptr operand
2861 KA = *res;
2862 LLVM_DEBUG(dbgs() << " HaveAlign(" << HaveAlign.value() << ") KnownAlign("
2863 << KA << ")\n");
2864 // Normalize 0 -> ABI alignment of the stored value type (operand 0).
2865 Type *ValTy = In.getOperand(0)->getType();
2866 Align EffA =
2867 (KA > 0) ? Align(KA) : Align(HVC.DL.getABITypeAlign(ValTy).value());
2868
2869 if (EffA < HaveAlign)
2870 return nullptr;
2871
2872 // Attach/replace the param attribute on pointer param #1.
2873 AttrBuilder AttrB(CI->getContext());
2874 AttrB.addAlignmentAttr(EffA);
2875 CI->setAttributes(
2876 CI->getAttributes().addParamAttributes(CI->getContext(), 1, AttrB));
2877 return CI;
2878}
2879
2880Value *HvxIdioms::processMLoad(Instruction &In) const {
2881 [[maybe_unused]] auto *InpTy = dyn_cast<VectorType>(In.getType());
2882 assert(InpTy && "Cannot handle non-vector type for llvm.masked.load");
2883 LLVM_DEBUG(dbgs() << "\n[Process mload](" << In << ")\n"
2884 << *In.getParent() << "\n");
2885 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2886 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2887 << ") type(" << *InpTy->getElementType() << ") of size("
2888 << InpTy->getScalarSizeInBits() << ")bits\n");
2889 auto *CI = dyn_cast<CallBase>(&In);
2890 assert(CI && "Expected to be a call to llvm.masked.load");
2891 // The pointer is operand #0, and its param attribute index is also 0.
2892 Align HaveAlign = CI->getParamAlign(0).valueOrOne();
2893
2894 // Compute best-known alignment KA from analysis.
2895 uint64_t KA = 1;
2896 if (auto res = getAlignment(In, In.getOperand(0))) // ptr operand
2897 KA = *res;
2898
2899 // Normalize 0 → ABI alignment of the loaded value type.
2900 Type *ValTy = In.getType();
2901 Align EffA =
2902 (KA > 0) ? Align(KA) : Align(HVC.DL.getABITypeAlign(ValTy).value());
2903 if (EffA < HaveAlign)
2904 return nullptr;
2905 LLVM_DEBUG(dbgs() << " HaveAlign(" << HaveAlign.value() << ") KnownAlign("
2906 << KA << ")\n");
2907
2908 // Attach/replace the param attribute on pointer param #0.
2909 AttrBuilder AttrB(CI->getContext());
2910 AttrB.addAlignmentAttr(EffA);
2911 CI->setAttributes(
2912 CI->getAttributes().addParamAttributes(CI->getContext(), 0, AttrB));
2913 return CI;
2914}
2915
2916auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
2917 const FxpOp &Op) const -> Value * {
2918 assert(Op.X.Val->getType() == Op.Y.Val->getType());
2919 auto *InpTy = cast<VectorType>(Op.X.Val->getType());
2920 unsigned Width = InpTy->getScalarSizeInBits();
2921 bool Rounding = Op.RoundAt.has_value();
2922
2923 if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
2924 // The fixed-point intrinsics do signed multiplication.
2925 if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
2926 Value *QMul = nullptr;
2927 if (Width == 16) {
2928 QMul = createMulQ15(Builder, Op.X, Op.Y, Rounding);
2929 } else if (Width == 32) {
2930 QMul = createMulQ31(Builder, Op.X, Op.Y, Rounding);
2931 }
2932 if (QMul != nullptr)
2933 return QMul;
2934 }
2935 }
2936
2937 assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n
2938 assert(Width < 32 || Width % 32 == 0); // Width > 32 => Width is 32*k
2939
2940 // If Width < 32, then it should really be 16.
2941 if (Width < 32) {
2942 if (Width < 16)
2943 return nullptr;
2944 // Getting here with Op.Frac == 0 isn't wrong, but suboptimal: here we
2945 // generate full-precision products, which is unnecessary if there is
2946 // no shift.
2947 assert(Width == 16);
2948 assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
2949 if (Op.Frac == 16) {
2950 // Multiply high
2951 if (Value *MulH = createMulH16(Builder, Op.X, Op.Y))
2952 return MulH;
2953 }
2954 // Do full-precision multiply and shift.
2955 Value *Prod32 = createMul16(Builder, Op.X, Op.Y);
2956 if (Rounding) {
2957 Value *RoundVal =
2958 ConstantInt::get(Prod32->getType(), 1ull << *Op.RoundAt);
2959 Prod32 = Builder.CreateAdd(Prod32, RoundVal, "add");
2960 }
2961
2962 Value *ShiftAmt = ConstantInt::get(Prod32->getType(), Op.Frac);
2963 Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
2964 ? Builder.CreateAShr(Prod32, ShiftAmt, "asr")
2965 : Builder.CreateLShr(Prod32, ShiftAmt, "lsr");
2966 return Builder.CreateTrunc(Shifted, InpTy, "trn");
2967 }
2968
2969 // Width >= 32
2970
2971 // Break up the arguments Op.X and Op.Y into vectors of smaller widths
2972 // in preparation of doing the multiplication by 32-bit parts.
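// Worked example: for i64 elements, each input is split into two 32-bit word
// vectors (low, high), createMulLong forms the four-word (128-bit per lane)
// product, the optional rounding bit 1 << *RoundAt lands in word
// *RoundAt / 32, the words are shifted right by Op.Frac, and
// joinVectorElements keeps as many low words as Op.ResTy needs.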
2973 auto WordX = HVC.splitVectorElements(Builder, Op.X.Val, /*ToWidth=*/32);
2974 auto WordY = HVC.splitVectorElements(Builder, Op.Y.Val, /*ToWidth=*/32);
2975 auto WordP = createMulLong(Builder, WordX, Op.X.Sgn, WordY, Op.Y.Sgn);
2976
2977 auto *HvxWordTy = cast<VectorType>(WordP.front()->getType());
2978
2979 // Add the optional rounding to the proper word.
2980 if (Op.RoundAt.has_value()) {
2981 Value *Zero = Constant::getNullValue(WordX[0]->getType());
2982 SmallVector<Value *> RoundV(WordP.size(), Zero);
2983 RoundV[*Op.RoundAt / 32] =
2984 ConstantInt::get(HvxWordTy, 1ull << (*Op.RoundAt % 32));
2985 WordP = createAddLong(Builder, WordP, RoundV);
2986 }
2987
2988 // createRightShiftLong?
2989
2990 // Shift all products right by Op.Frac.
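// For example, with four words W0..W3 per lane and Frac = 31 (SkipWords = 0),
// result word k is the low 32 bits of (W[k+1]:W[k]) >> 31, computed with
// fshr(W[k+1], W[k], 31); the topmost word uses an arithmetic shift of W3
// to keep the sign.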
2991 unsigned SkipWords = Op.Frac / 32;
2992 Constant *ShiftAmt = ConstantInt::get(HvxWordTy, Op.Frac % 32);
2993
2994 for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
2995 int Src = Dst + SkipWords;
2996 Value *Lo = WordP[Src];
2997 if (Src + 1 < End) {
2998 Value *Hi = WordP[Src + 1];
2999 WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,
3000 {Hi, Lo, ShiftAmt},
3001 /*FMFSource*/ nullptr, "int");
3002 } else {
3003 // The shift of the most significant word.
3004 WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt, "asr");
3005 }
3006 }
3007 if (SkipWords != 0)
3008 WordP.resize(WordP.size() - SkipWords);
3009
3010 return HVC.joinVectorElements(Builder, WordP, Op.ResTy);
3011}
3012
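// Note on the Q15/Q31 helpers below: processFxpMulChopped calls them when
// Width == Op.Frac + 1 and neither operand is known to be unsigned, i.e. the
// source computes (X * Y [+ (1 << (Frac - 1))]) >> Frac with Frac = 15 on
// i16 lanes or Frac = 31 on i32 lanes. The helpers map that pattern onto the
// dedicated HVX fractional-multiply intrinsics instead of widening to a
// full-precision product followed by a shift.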
3013auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
3014 bool Rounding) const -> Value * {
3015 assert(X.Val->getType() == Y.Val->getType());
3016 assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
3017 assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
3018
3019 // There is no non-rounding intrinsic for i16.
3020 if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
3021 return nullptr;
3022
3023 auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhvsrs);
3024 return HVC.createHvxIntrinsic(Builder, V6_vmpyhvsrs, X.Val->getType(),
3025 {X.Val, Y.Val});
3026}
3027
3028auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
3029 bool Rounding) const -> Value * {
3030 Type *InpTy = X.Val->getType();
3031 assert(InpTy == Y.Val->getType());
3032 assert(InpTy->getScalarType() == HVC.getIntTy(32));
3033 assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
3034
3035 if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
3036 return nullptr;
3037
3038 auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyewuh);
3039 auto V6_vmpyo_acc = Rounding
3040 ? HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_rnd_sacc)
3041 : HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_sacc);
3042 Value *V1 =
3043 HVC.createHvxIntrinsic(Builder, V6_vmpyewuh, InpTy, {X.Val, Y.Val});
3044 return HVC.createHvxIntrinsic(Builder, V6_vmpyo_acc, InpTy,
3045 {V1, X.Val, Y.Val});
3046}
3047
3048auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
3049 Value *CarryIn) const
3050 -> std::pair<Value *, Value *> {
3051 assert(X->getType() == Y->getType());
3052 auto VecTy = cast<VectorType>(X->getType());
3053 if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
3054 SmallVector<Value *> Args = {X, Y};
3055 Intrinsic::ID AddCarry;
3056 if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
3057 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarryo);
3058 } else {
3059 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry);
3060 if (CarryIn == nullptr)
3061 CarryIn = Constant::getNullValue(HVC.getBoolTy(HVC.length(VecTy)));
3062 Args.push_back(CarryIn);
3063 }
3064 Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,
3065 /*RetTy=*/nullptr, Args);
3066 Value *Result = Builder.CreateExtractValue(Ret, {0}, "ext");
3067 Value *CarryOut = Builder.CreateExtractValue(Ret, {1}, "ext");
3068 return {Result, CarryOut};
3069 }
3070
3071 // In other cases, do a regular add, and unsigned compare-less-than.
3072 // The carry-out can originate in two places: adding the carry-in or adding
3073 // the two input values.
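// For example, on i32 lanes with X = 0xFFFFFFFF and CarryIn set, X + 1 wraps
// to 0 and "Result1 <u X" detects that carry; adding Y can wrap at most once
// more, which "Result2 <u Y" detects, so OR-ing the two compares gives the
// carry-out.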
3074 Value *Result1 = X; // Result1 = X + CarryIn
3075 if (CarryIn != nullptr) {
3076 unsigned Width = VecTy->getScalarSizeInBits();
3077 uint32_t Mask = 1;
3078 if (Width < 32) {
3079 for (unsigned i = 0, e = 32 / Width; i != e; ++i)
3080 Mask = (Mask << Width) | 1;
3081 }
3082 auto V6_vandqrt = HVC.HST.getIntrinsicId(Hexagon::V6_vandqrt);
3083 Value *ValueIn =
3084 HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,
3085 {CarryIn, HVC.getConstInt(Mask)});
3086 Result1 = Builder.CreateAdd(X, ValueIn, "add");
3087 }
3088
3089 Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X, "cmp");
3090 Value *Result2 = Builder.CreateAdd(Result1, Y, "add");
3091 Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y, "cmp");
3092 return {Result2, Builder.CreateOr(CarryOut1, CarryOut2, "orb")};
3093}
3094
3095auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
3096 -> Value * {
3097 Intrinsic::ID V6_vmpyh = 0;
3098 std::tie(X, Y) = canonSgn(X, Y);
3099
3100 if (X.Sgn == Signed) {
3101 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhv);
3102 } else if (Y.Sgn == Signed) {
3103 // In vmpyhus the second operand is unsigned
3104 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhus);
3105 } else {
3106 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhv);
3107 }
3108
3109 // i16*i16 -> i32 / interleaved
3110 Value *P =
3111 HVC.createHvxIntrinsic(Builder, V6_vmpyh, HvxP32Ty, {Y.Val, X.Val});
3112 // Deinterleave
3113 return HVC.vshuff(Builder, HVC.sublo(Builder, P), HVC.subhi(Builder, P));
3114}
3115
3116auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
3117 -> Value * {
3118 Type *HvxI16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/false);
3119
3120 if (HVC.HST.useHVXV69Ops()) {
3121 if (X.Sgn != Signed && Y.Sgn != Signed) {
3122 auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhvs);
3123 return HVC.createHvxIntrinsic(Builder, V6_vmpyuhvs, HvxI16Ty,
3124 {X.Val, Y.Val});
3125 }
3126 }
3127
3128 Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);
3129 Value *Pair16 =
3130 Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty, "cst");
3131 unsigned Len = HVC.length(HvxP16Ty) / 2;
3132
3133 SmallVector<int, 128> PickOdd(Len);
3134 for (int i = 0; i != static_cast<int>(Len); ++i)
3135 PickOdd[i] = 2 * i + 1;
3136
3137 return Builder.CreateShuffleVector(
3138 HVC.sublo(Builder, Pair16), HVC.subhi(Builder, Pair16), PickOdd, "shf");
3139}
3140
3141auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
3142 -> std::pair<Value *, Value *> {
3143 assert(X.Val->getType() == Y.Val->getType());
3144 assert(X.Val->getType() == HvxI32Ty);
3145
3146 Intrinsic::ID V6_vmpy_parts;
3147 std::tie(X, Y) = canonSgn(X, Y);
3148
3149 if (X.Sgn == Signed) {
3150 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
3151 } else if (Y.Sgn == Signed) {
3152 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
3153 } else {
3154 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
3155 }
3156
3157 Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,
3158 {X.Val, Y.Val}, {HvxI32Ty});
3159 Value *Hi = Builder.CreateExtractValue(Parts, {0}, "ext");
3160 Value *Lo = Builder.CreateExtractValue(Parts, {1}, "ext");
3161 return {Lo, Hi};
3162}
3163
3164auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
3165 ArrayRef<Value *> WordY) const
3166 -> SmallVector<Value *> {
3167 assert(WordX.size() == WordY.size());
3168 unsigned Idx = 0, Length = WordX.size();
3169 SmallVector<Value *> Sum(Length);
3170
3171 while (Idx != Length) {
3172 if (HVC.isZero(WordX[Idx]))
3173 Sum[Idx] = WordY[Idx];
3174 else if (HVC.isZero(WordY[Idx]))
3175 Sum[Idx] = WordX[Idx];
3176 else
3177 break;
3178 ++Idx;
3179 }
3180
3181 Value *Carry = nullptr;
3182 for (; Idx != Length; ++Idx) {
3183 std::tie(Sum[Idx], Carry) =
3184 createAddCarry(Builder, WordX[Idx], WordY[Idx], Carry);
3185 }
3186
3187 // This drops the final carry beyond the highest word.
3188 return Sum;
3189}
3190
3191auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
3192 Signedness SgnX, ArrayRef<Value *> WordY,
3193 Signedness SgnY) const -> SmallVector<Value *> {
3194 SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
3195
3196 // WordX[i] * WordY[j] produces words i+j and i+j+1 of the results,
3197 // that is halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
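// For example, for 64-bit lanes split into two words each (X = X1:X0,
// Y = Y1:Y0), X0*Y0 feeds words 0 and 1, X0*Y1 and X1*Y0 feed words 1 and 2,
// and X1*Y1 feeds words 2 and 3; the column sums are then added with carries
// below.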
3198 for (int i = 0, e = WordX.size(); i != e; ++i) {
3199 for (int j = 0, f = WordY.size(); j != f; ++j) {
3200 // Check the 4 halves that this multiplication can generate.
3201 Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
3202 Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
3203 auto [Lo, Hi] = createMul32(Builder, {WordX[i], SX}, {WordY[j], SY});
3204 Products[i + j + 0].push_back(Lo);
3205 Products[i + j + 1].push_back(Hi);
3206 }
3207 }
3208
3209 Value *Zero = Constant::getNullValue(WordX[0]->getType());
3210
3211 auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
3212 if (Vector.empty())
3213 return Zero;
3214 auto Last = Vector.back();
3215 Vector.pop_back();
3216 return Last;
3217 };
3218
3219 for (int i = 0, e = Products.size(); i != e; ++i) {
3220 while (Products[i].size() > 1) {
3221 Value *Carry = nullptr; // no carry-in
3222 for (int j = i; j != e; ++j) {
3223 auto &ProdJ = Products[j];
3224 auto [Sum, CarryOut] = createAddCarry(Builder, pop_back_or_zero(ProdJ),
3225 pop_back_or_zero(ProdJ), Carry);
3226 ProdJ.insert(ProdJ.begin(), Sum);
3227 Carry = CarryOut;
3228 }
3229 }
3230 }
3231
3232 SmallVector<Value *> WordP;
3233 for (auto &P : Products) {
3234 assert(P.size() == 1 && "Should have been added together");
3235 WordP.push_back(P.front());
3236 }
3237
3238 return WordP;
3239}
3240
3241auto HvxIdioms::run() -> bool {
3242 bool Changed = false;
3243
3244 for (BasicBlock &B : HVC.F) {
3245 for (auto It = B.rbegin(); It != B.rend(); ++It) {
3246 if (auto Fxm = matchFxpMul(*It)) {
3247 Value *New = processFxpMul(*It, *Fxm);
3248 // Always report "changed" for now.
3249 Changed = true;
3250 if (!New)
3251 continue;
3252 bool StartOver = !isa<Instruction>(New);
3253 It->replaceAllUsesWith(New);
3254 RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
3255 It = StartOver ? B.rbegin()
3256 : cast<Instruction>(New)->getReverseIterator();
3257 Changed = true;
3258 } else if (matchGather(*It)) {
3259 Value *New = processVGather(*It);
3260 if (!New)
3261 continue;
3262 LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n");
3263 // We replace original intrinsic with a new pseudo call.
3264 It->eraseFromParent();
3265 It = cast<Instruction>(New)->getReverseIterator();
3267 Changed = true;
3268 } else if (matchScatter(*It)) {
3269 Value *New = processVScatter(*It);
3270 if (!New)
3271 continue;
3272 LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n");
3273 // We replace original intrinsic with a new pseudo call.
3274 It->eraseFromParent();
3275 It = cast<Instruction>(New)->getReverseIterator();
3277 Changed = true;
3278 } else if (matchMLoad(*It)) {
3279 Value *New = processMLoad(*It);
3280 if (!New)
3281 continue;
3282 LLVM_DEBUG(dbgs() << " MLoad : " << *New << "\n");
3283 Changed = true;
3284 } else if (matchMStore(*It)) {
3285 Value *New = processMStore(*It);
3286 if (!New)
3287 continue;
3288 LLVM_DEBUG(dbgs() << " MStore : " << *New << "\n");
3289 Changed = true;
3290 }
3291 }
3292 }
3293
3294 return Changed;
3295}
3296
3297// --- End HvxIdioms
3298
3299auto HexagonVectorCombine::run() -> bool {
3300 if (DumpModule)
3301 dbgs() << "Module before HexagonVectorCombine\n" << *F.getParent();
3302
3303 bool Changed = false;
3304 if (HST.useHVXOps()) {
3305 if (VAEnabled)
3306 Changed |= AlignVectors(*this).run();
3307 if (VIEnabled)
3308 Changed |= HvxIdioms(*this).run();
3309 }
3310
3311 if (DumpModule) {
3312 dbgs() << "Module " << (Changed ? "(modified)" : "(unchanged)")
3313 << " after HexagonVectorCombine\n"
3314 << *F.getParent();
3315 }
3316 return Changed;
3317}
3318
3319auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
3320 return IntegerType::get(F.getContext(), Width);
3321}
3322
3323auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
3324 assert(ElemCount >= 0);
3325 IntegerType *ByteTy = Type::getInt8Ty(F.getContext());
3326 if (ElemCount == 0)
3327 return ByteTy;
3328 return VectorType::get(ByteTy, ElemCount, /*Scalable=*/false);
3329}
3330
3331auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
3332 assert(ElemCount >= 0);
3333 IntegerType *BoolTy = Type::getInt1Ty(F.getContext());
3334 if (ElemCount == 0)
3335 return BoolTy;
3336 return VectorType::get(BoolTy, ElemCount, /*Scalable=*/false);
3337}
3338
3339auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
3340 -> ConstantInt * {
3341 return ConstantInt::getSigned(getIntTy(Width), Val);
3342}
3343
3344auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
3345 if (auto *C = dyn_cast<Constant>(Val))
3346 return C->isZeroValue();
3347 return false;
3348}
3349
3350auto HexagonVectorCombine::getIntValue(const Value *Val) const
3351 -> std::optional<APInt> {
3352 if (auto *CI = dyn_cast<ConstantInt>(Val))
3353 return CI->getValue();
3354 return std::nullopt;
3355}
3356
3357auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
3358 return isa<UndefValue>(Val);
3359}
3360
3361auto HexagonVectorCombine::isTrue(const Value *Val) const -> bool {
3362 return Val == ConstantInt::getTrue(Val->getType());
3363}
3364
3365auto HexagonVectorCombine::isFalse(const Value *Val) const -> bool {
3366 return isZero(Val);
3367}
3368
3369auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
3370 -> VectorType * {
3371 EVT ETy = EVT::getEVT(ElemTy, false);
3372 assert(ETy.isSimple() && "Invalid HVX element type");
3373 // Do not allow boolean types here: they don't have a fixed length.
3374 assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
3375 "Invalid HVX element type");
3376 unsigned HwLen = HST.getVectorLength();
3377 unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
3378 return VectorType::get(ElemTy, Pair ? 2 * NumElems : NumElems,
3379 /*Scalable=*/false);
3380}
3381
3382auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
3383 -> int {
3384 return getSizeOf(Val->getType(), Kind);
3385}
3386
3387auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
3388 -> int {
3389 auto *NcTy = const_cast<Type *>(Ty);
3390 switch (Kind) {
3391 case Store:
3392 return DL.getTypeStoreSize(NcTy).getFixedValue();
3393 case Alloc:
3394 return DL.getTypeAllocSize(NcTy).getFixedValue();
3395 }
3396 llvm_unreachable("Unhandled SizeKind enum");
3397}
3398
3399auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
3400 // The actual type may be shorter than the HVX vector, so determine
3401 // the alignment based on subtarget info.
3402 if (HST.isTypeForHVX(Ty))
3403 return HST.getVectorLength();
3404 return DL.getABITypeAlign(Ty).value();
3405}
3406
3407auto HexagonVectorCombine::length(Value *Val) const -> size_t {
3408 return length(Val->getType());
3409}
3410
3411auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
3412 auto *VecTy = dyn_cast<VectorType>(Ty);
3413 assert(VecTy && "Must be a vector type");
3414 return VecTy->getElementCount().getFixedValue();
3415}
3416
3417auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
3418 if (auto *In = dyn_cast<Instruction>(V)) {
3419 SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
3420 return simplifyInstruction(In, Q);
3421 }
3422 return nullptr;
3423}
3424
3425// Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
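// For example, with Start = 4, Length = 8, and Where = 16, bytes 4..11 of
// Src overwrite bytes 16..23 of Dst and the rest of Dst is kept; the shuffle
// mask below picks Src[Start + (i - Where)] inside [Where, Where+Length) and
// Dst[i] elsewhere.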
3426auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
3427 Value *Src, int Start, int Length,
3428 int Where) const -> Value * {
3429 assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
3430 int SrcLen = getSizeOf(Src);
3431 int DstLen = getSizeOf(Dst);
3432 assert(0 <= Start && Start + Length <= SrcLen);
3433 assert(0 <= Where && Where + Length <= DstLen);
3434
3435 int P2Len = PowerOf2Ceil(SrcLen | DstLen);
3436 auto *Poison = PoisonValue::get(getByteTy());
3437 Value *P2Src = vresize(Builder, Src, P2Len, Poison);
3438 Value *P2Dst = vresize(Builder, Dst, P2Len, Poison);
3439
3440 SmallVector<int, 256> SMask(P2Len);
3441 for (int i = 0; i != P2Len; ++i) {
3442 // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
3443 // Otherwise, pick Dst[i];
3444 SMask[i] =
3445 (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
3446 }
3447
3448 Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask, "shf");
3449 return vresize(Builder, P2Insert, DstLen, Poison);
3450}
3451
3452auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
3453 Value *Hi, Value *Amt) const -> Value * {
3454 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3455 if (isZero(Amt))
3456 return Hi;
3457 int VecLen = getSizeOf(Hi);
3458 if (auto IntAmt = getIntValue(Amt))
3459 return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),
3460 VecLen);
3461
3462 if (HST.isTypeForHVX(Hi->getType())) {
3463 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3464 "Expecting an exact HVX type");
3465 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_vlalignb),
3466 Hi->getType(), {Hi, Lo, Amt});
3467 }
3468
3469 if (VecLen == 4) {
3470 Value *Pair = concat(Builder, {Lo, Hi});
3471 Value *Shift =
3472 Builder.CreateLShr(Builder.CreateShl(Pair, Amt, "shl"), 32, "lsr");
3473 Value *Trunc =
3474 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3475 return Builder.CreateBitCast(Trunc, Hi->getType(), "cst");
3476 }
3477 if (VecLen == 8) {
3478 Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt, "sub");
3479 return vralignb(Builder, Lo, Hi, Sub);
3480 }
3481 llvm_unreachable("Unexpected vector length");
3482}
3483
3484auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
3485 Value *Hi, Value *Amt) const -> Value * {
3486 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3487 if (isZero(Amt))
3488 return Lo;
3489 int VecLen = getSizeOf(Lo);
3490 if (auto IntAmt = getIntValue(Amt))
3491 return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);
3492
3493 if (HST.isTypeForHVX(Lo->getType())) {
3494 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3495 "Expecting an exact HVX type");
3496 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_valignb),
3497 Lo->getType(), {Hi, Lo, Amt});
3498 }
3499
3500 if (VecLen == 4) {
3501 Value *Pair = concat(Builder, {Lo, Hi});
3502 Value *Shift = Builder.CreateLShr(Pair, Amt, "lsr");
3503 Value *Trunc =
3504 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3505 return Builder.CreateBitCast(Trunc, Lo->getType(), "cst");
3506 }
3507 if (VecLen == 8) {
3508 Type *Int64Ty = Type::getInt64Ty(F.getContext());
3509 Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst");
3510 Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst");
3511 Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb,
3512 {Hi64, Lo64, Amt},
3513 /*FMFSource=*/nullptr, "cup");
3514 return Builder.CreateBitCast(Call, Lo->getType(), "cst");
3515 }
3516 llvm_unreachable("Unexpected vector length");
3517}
3518
3519// Concatenates a sequence of vectors of the same type.
3520auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
3521 ArrayRef<Value *> Vecs) const -> Value * {
3522 assert(!Vecs.empty());
3523 SmallVector<int, 256> SMask;
3524 std::vector<Value *> Work[2];
3525 int ThisW = 0, OtherW = 1;
3526
3527 Work[ThisW].assign(Vecs.begin(), Vecs.end());
3528 while (Work[ThisW].size() > 1) {
3529 auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());
3530 SMask.resize(length(Ty) * 2);
3531 std::iota(SMask.begin(), SMask.end(), 0);
3532
3533 Work[OtherW].clear();
3534 if (Work[ThisW].size() % 2 != 0)
3535 Work[ThisW].push_back(UndefValue::get(Ty));
3536 for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
3537 Value *Joined = Builder.CreateShuffleVector(
3538 Work[ThisW][i], Work[ThisW][i + 1], SMask, "shf");
3539 Work[OtherW].push_back(Joined);
3540 }
3541 std::swap(ThisW, OtherW);
3542 }
3543
3544 // Since there may have been some undefs appended to make shuffle operands
3545 // have the same type, perform the last shuffle to only pick the original
3546 // elements.
3547 SMask.resize(Vecs.size() * length(Vecs.front()->getType()));
3548 std::iota(SMask.begin(), SMask.end(), 0);
3549 Value *Total = Work[ThisW].front();
3550 return Builder.CreateShuffleVector(Total, SMask, "shf");
3551}
3552
3553auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
3554 int NewSize, Value *Pad) const -> Value * {
3555 assert(isa<VectorType>(Val->getType()));
3556 auto *ValTy = cast<VectorType>(Val->getType());
3557 assert(ValTy->getElementType() == Pad->getType());
3558
3559 int CurSize = length(ValTy);
3560 if (CurSize == NewSize)
3561 return Val;
3562 // Truncate?
3563 if (CurSize > NewSize)
3564 return getElementRange(Builder, Val, /*Ignored*/ Val, 0, NewSize);
3565 // Extend.
3566 SmallVector<int, 128> SMask(NewSize);
3567 std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
3568 std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
3569 Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad, "spt");
3570 return Builder.CreateShuffleVector(Val, PadVec, SMask, "shf");
3571}
3572
3573auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
3574 Type *FromTy, Type *ToTy) const -> Value * {
3575 // Mask is a vector <N x i1>, where each element corresponds to an
3576 // element of FromTy. Remap it so that each element will correspond
3577 // to an element of ToTy.
3578 assert(isa<VectorType>(Mask->getType()));
3579
3580 Type *FromSTy = FromTy->getScalarType();
3581 Type *ToSTy = ToTy->getScalarType();
3582 if (FromSTy == ToSTy)
3583 return Mask;
3584
3585 int FromSize = getSizeOf(FromSTy);
3586 int ToSize = getSizeOf(ToSTy);
3587 assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
3588
3589 auto *MaskTy = cast<VectorType>(Mask->getType());
3590 int FromCount = length(MaskTy);
3591 int ToCount = (FromCount * FromSize) / ToSize;
3592 assert((FromCount * FromSize) % ToSize == 0);
3593
3594 auto *FromITy = getIntTy(FromSize * 8);
3595 auto *ToITy = getIntTy(ToSize * 8);
3596
3597 // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
3598 // -> trunc to <M x i1>.
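// For example, remapping a <64 x i1> mask from i16 elements (FromSize = 2)
// to i32 elements (ToSize = 4) sign-extends it to <64 x i16>, bitcasts that
// to <32 x i32>, and truncates to <32 x i1>, halving the lane count so the
// mask again has one bit per (now wider) element.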
3599 Value *Ext = Builder.CreateSExt(
3600 Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false), "sxt");
3601 Value *Cast = Builder.CreateBitCast(
3602 Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false), "cst");
3603 return Builder.CreateTrunc(
3604 Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false), "trn");
3605}
3606
3607// Bitcast to bytes, and return least significant bits.
3608auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
3609 -> Value * {
3610 Type *ScalarTy = Val->getType()->getScalarType();
3611 if (ScalarTy == getBoolTy())
3612 return Val;
3613
3614 Value *Bytes = vbytes(Builder, Val);
3615 if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
3616 return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)), "trn");
3617 // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
3618 // <1 x i1>.
3619 return Builder.CreateTrunc(Bytes, getBoolTy(), "trn");
3620}
3621
3622// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
3623auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
3624 -> Value * {
3625 Type *ScalarTy = Val->getType()->getScalarType();
3626 if (ScalarTy == getByteTy())
3627 return Val;
3628
3629 if (ScalarTy != getBoolTy())
3630 return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)), "cst");
3631 // For bool, return a sext from i1 to i8.
3632 if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
3633 return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy), "sxt");
3634 return Builder.CreateSExt(Val, getByteTy(), "sxt");
3635}
3636
3637auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
3638 unsigned Start, unsigned Length) const
3639 -> Value * {
3640 assert(Start + Length <= length(Val));
3641 return getElementRange(Builder, Val, /*Ignored*/ Val, Start, Length);
3642}
3643
3644auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
3645 -> Value * {
3646 size_t Len = length(Val);
3647 assert(Len % 2 == 0 && "Length should be even");
3648 return subvector(Builder, Val, 0, Len / 2);
3649}
3650
3651auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
3652 -> Value * {
3653 size_t Len = length(Val);
3654 assert(Len % 2 == 0 && "Length should be even");
3655 return subvector(Builder, Val, Len / 2, Len / 2);
3656}
3657
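// Expository example for the two shuffles below, with Len = 4 and inputs
// A = (a0,a1,a2,a3), B = (b0,b1,b2,b3):
//   vdeal(A, B)  -> (a0,a2,b0,b2, a1,a3,b1,b3)   // deinterleave: evens, odds
//   vshuff(A, B) -> (a0,b0,a1,b1, a2,b2,a3,b3)   // interleave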
3658auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
3659 Value *Val1) const -> Value * {
3660 assert(Val0->getType() == Val1->getType());
3661 int Len = length(Val0);
3662 SmallVector<int, 128> Mask(2 * Len);
3663
3664 for (int i = 0; i != Len; ++i) {
3665 Mask[i] = 2 * i; // Even
3666 Mask[i + Len] = 2 * i + 1; // Odd
3667 }
3668 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3669}
3670
3671auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
3672 Value *Val1) const -> Value * { //
3673 assert(Val0->getType() == Val1->getType());
3674 int Len = length(Val0);
3675 SmallVector<int, 128> Mask(2 * Len);
3676
3677 for (int i = 0; i != Len; ++i) {
3678 Mask[2 * i + 0] = i; // Val0
3679 Mask[2 * i + 1] = i + Len; // Val1
3680 }
3681 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3682}
3683
3684auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
3685 Intrinsic::ID IntID, Type *RetTy,
3686 ArrayRef<Value *> Args,
3687 ArrayRef<Type *> ArgTys,
3688 ArrayRef<Value *> MDSources) const
3689 -> Value * {
3690 auto getCast = [&](IRBuilderBase &Builder, Value *Val,
3691 Type *DestTy) -> Value * {
3692 Type *SrcTy = Val->getType();
3693 if (SrcTy == DestTy)
3694 return Val;
3695
3696 // Non-HVX type. It should be a scalar, and it should already have
3697 // a valid type.
3698 assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
3699
3700 Type *BoolTy = Type::getInt1Ty(F.getContext());
3701 if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)
3702 return Builder.CreateBitCast(Val, DestTy, "cst");
3703
3704 // Predicate HVX vector.
3705 unsigned HwLen = HST.getVectorLength();
3706 Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
3707 : Intrinsic::hexagon_V6_pred_typecast_128B;
3708 return Builder.CreateIntrinsic(TC, {DestTy, Val->getType()}, {Val},
3709 /*FMFSource=*/nullptr, "cup");
3710 };
3711
3712 Function *IntrFn =
3713 Intrinsic::getOrInsertDeclaration(F.getParent(), IntID, ArgTys);
3714 FunctionType *IntrTy = IntrFn->getFunctionType();
3715
3716 SmallVector<Value *, 4> IntrArgs;
3717 for (int i = 0, e = Args.size(); i != e; ++i) {
3718 Value *A = Args[i];
3719 Type *T = IntrTy->getParamType(i);
3720 if (A->getType() != T) {
3721 IntrArgs.push_back(getCast(Builder, A, T));
3722 } else {
3723 IntrArgs.push_back(A);
3724 }
3725 }
3726 StringRef MaybeName = !IntrTy->getReturnType()->isVoidTy() ? "cup" : "";
3727 CallInst *Call = Builder.CreateCall(IntrFn, IntrArgs, MaybeName);
3728
3729 MemoryEffects ME = Call->getAttributes().getMemoryEffects();
3730 if (!ME.doesNotAccessMemory() && !ME.onlyAccessesInaccessibleMem())
3731 propagateMetadata(Call, MDSources);
3732
3733 Type *CallTy = Call->getType();
3734 if (RetTy == nullptr || CallTy == RetTy)
3735 return Call;
3736 // Scalar types should have RetTy matching the call return type.
3737 assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
3738 return getCast(Builder, Call, RetTy);
3739}
3740
3741auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
3742 Value *Vec,
3743 unsigned ToWidth) const
3744 -> SmallVector<Value *> {
3745 // Break a vector of wide elements into a series of vectors with narrow
3746 // elements:
3747 // (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
3748 // -->
3749 // (a0, a1, a2, ...) // lowest "ToWidth" bits
3750 // (b0, b1, b2, ...) // the next lowest...
3751 // (c0, c1, c2, ...) // ...
3752 // ...
3753 //
3754 // The number of elements in each resulting vector is the same as
3755 // in the original vector.
3756
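// For example, splitting a vector of i64 elements to ToWidth = 32 produces
// two vectors with the same lane count: Results[0] holds the low 32 bits of
// every element and Results[1] the high 32 bits, obtained by one
// bitcast-and-vdeal step below.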
3757 auto *VecTy = cast<VectorType>(Vec->getType());
3758 assert(VecTy->getElementType()->isIntegerTy());
3759 unsigned FromWidth = VecTy->getScalarSizeInBits();
3760 assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
3761 assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
3762 unsigned NumResults = FromWidth / ToWidth;
3763
3764 SmallVector<Value *> Results(NumResults);
3765 Results[0] = Vec;
3766 unsigned Length = length(VecTy);
3767
3768 // Do it by splitting in half, since those operations correspond to deal
3769 // instructions.
3770 auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
3771 // Take V = Results[Begin] and split it into halves L and H.
3772 // Store Results[Begin] = L and Results[(Begin+End)/2] = H, then
3773 // recurse on the two halves: split(Begin, Half) and split(Half, End).
3774 if (Begin + 1 == End)
3775 return;
3776
3777 Value *Val = Results[Begin];
3778 unsigned Width = Val->getType()->getScalarSizeInBits();
3779
3780 auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);
3781 Value *VVal = Builder.CreateBitCast(Val, VTy, "cst");
3782
3783 Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));
3784
3785 unsigned Half = (Begin + End) / 2;
3786 Results[Begin] = sublo(Builder, Res);
3787 Results[Half] = subhi(Builder, Res);
3788
3789 splitFunc(Begin, Half, splitFunc);
3790 splitFunc(Half, End, splitFunc);
3791 };
3792
3793 splitInHalf(0, NumResults, splitInHalf);
3794 return Results;
3795}
3796
3797auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
3798 ArrayRef<Value *> Values,
3799 VectorType *ToType) const
3800 -> Value * {
3801 assert(ToType->getElementType()->isIntegerTy());
3802
3803 // If the list of values does not have power-of-2 elements, append copies
3804 // of the sign bit to it, to make the size be 2^n.
3805 // The reason for this is that the values will be joined in pairs, because
3806 // otherwise the shuffles will result in convoluted code. With pairwise
3807 // joins, the shuffles will hopefully be folded into a perfect shuffle.
3808 // The output will need to be sign-extended to a type with element width
3809 // being a power-of-2 anyway.
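// For example, joining two i32-element vectors (low words in Inputs[0], high
// words in Inputs[1]) into an i64-element type interleaves them with vshuff
// and bitcasts the pair to the wider elements; if only the low words are
// given, the missing high words are filled with copies of the sign bit.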
3810 SmallVector<Value *> Inputs(Values);
3811
3812 unsigned ToWidth = ToType->getScalarSizeInBits();
3813 unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
3814 assert(Width <= ToWidth);
3815 assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
3816 unsigned Length = length(Inputs.front()->getType());
3817
3818 unsigned NeedInputs = ToWidth / Width;
3819 if (Inputs.size() != NeedInputs) {
3820 // Having too many inputs is ok: drop the high bits (usual wrap-around).
3821 // If there are too few, fill them with the sign bit.
3822 Value *Last = Inputs.back();
3823 Value *Sign = Builder.CreateAShr(
3824 Last, ConstantInt::get(Last->getType(), Width - 1), "asr");
3825 Inputs.resize(NeedInputs, Sign);
3826 }
3827
3828 while (Inputs.size() > 1) {
3829 Width *= 2;
3830 auto *VTy = VectorType::get(getIntTy(Width), Length, false);
3831 for (int i = 0, e = Inputs.size(); i < e; i += 2) {
3832 Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);
3833 Inputs[i / 2] = Builder.CreateBitCast(Res, VTy, "cst");
3834 }
3835 Inputs.resize(Inputs.size() / 2);
3836 }
3837
3838 assert(Inputs.front()->getType() == ToType);
3839 return Inputs.front();
3840}
3841
3842auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
3843 Value *Ptr1) const
3844 -> std::optional<int> {
3845 // Try SCEV first.
3846 const SCEV *Scev0 = SE.getSCEV(Ptr0);
3847 const SCEV *Scev1 = SE.getSCEV(Ptr1);
3848 const SCEV *ScevDiff = SE.getMinusSCEV(Scev0, Scev1);
3849 if (auto *Const = dyn_cast<SCEVConstant>(ScevDiff)) {
3850 APInt V = Const->getAPInt();
3851 if (V.isSignedIntN(8 * sizeof(int)))
3852 return static_cast<int>(V.getSExtValue());
3853 }
3854
3855 struct Builder : IRBuilder<> {
3856 Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
3857 ~Builder() {
3858 for (Instruction *I : llvm::reverse(ToErase))
3859 I->eraseFromParent();
3860 }
3861 SmallVector<Instruction *, 8> ToErase;
3862 };
3863
3864#define CallBuilder(B, F) \
3865 [&](auto &B_) { \
3866 Value *V = B_.F; \
3867 if (auto *I = dyn_cast<Instruction>(V)) \
3868 B_.ToErase.push_back(I); \
3869 return V; \
3870 }(B)
3871
3872 auto Simplify = [this](Value *V) {
3873 if (Value *S = simplify(V))
3874 return S;
3875 return V;
3876 };
3877
3878 auto StripBitCast = [](Value *V) {
3879 while (auto *C = dyn_cast<BitCastInst>(V))
3880 V = C->getOperand(0);
3881 return V;
3882 };
3883
3884 Ptr0 = StripBitCast(Ptr0);
3885 Ptr1 = StripBitCast(Ptr1);
3886 if (!isa<GetElementPtrInst>(Ptr0) || !isa<GetElementPtrInst>(Ptr1))
3887 return std::nullopt;
3888
3889 auto *Gep0 = cast<GetElementPtrInst>(Ptr0);
3890 auto *Gep1 = cast<GetElementPtrInst>(Ptr1);
3891 if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
3892 return std::nullopt;
3893 if (Gep0->getSourceElementType() != Gep1->getSourceElementType())
3894 return std::nullopt;
3895
3896 Builder B(Gep0->getParent());
3897 int Scale = getSizeOf(Gep0->getSourceElementType(), Alloc);
3898
3899 // FIXME: for now only check GEPs with a single index.
3900 if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
3901 return std::nullopt;
3902
3903 Value *Idx0 = Gep0->getOperand(1);
3904 Value *Idx1 = Gep1->getOperand(1);
3905
3906 // First, try to simplify the subtraction directly.
3907 if (auto *Diff = dyn_cast<ConstantInt>(
3908 Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
3909 return Diff->getSExtValue() * Scale;
3910
3911 KnownBits Known0 = getKnownBits(Idx0, Gep0);
3912 KnownBits Known1 = getKnownBits(Idx1, Gep1);
3913 APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
3914 if (Unknown.isAllOnes())
3915 return std::nullopt;
3916
3917 Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
3918 Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
3919 Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
3920 Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
3921 int Diff0 = 0;
3922 if (auto *C = dyn_cast<ConstantInt>(SubU)) {
3923 Diff0 = C->getSExtValue();
3924 } else {
3925 return std::nullopt;
3926 }
3927
3928 Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);
3929 Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
3930 Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
3931 Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
3932 int Diff1 = 0;
3933 if (auto *C = dyn_cast<ConstantInt>(SubK)) {
3934 Diff1 = C->getSExtValue();
3935 } else {
3936 return std::nullopt;
3937 }
3938
3939 return (Diff0 + Diff1) * Scale;
3940
3941#undef CallBuilder
3942}
3943
3944auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
3945 const Instruction *CtxI) const
3946 -> unsigned {
3947 return ComputeMaxSignificantBits(V, DL, &AC, CtxI, &DT);
3948}
3949
3950auto HexagonVectorCombine::getKnownBits(const Value *V,
3951 const Instruction *CtxI) const
3952 -> KnownBits {
3953 return computeKnownBits(V, DL, &AC, CtxI, &DT);
3954}
3955
3956auto HexagonVectorCombine::isSafeToClone(const Instruction &In) const -> bool {
3957 if (In.mayHaveSideEffects() || In.isAtomic() || In.isVolatile() ||
3958 In.isFenceLike() || In.mayReadOrWriteMemory()) {
3959 return false;
3960 }
3961 if (isa<CallBase>(In) || isa<AllocaInst>(In))
3962 return false;
3963 return true;
3964}
3965
3966template <typename T>
3967auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
3968 BasicBlock::const_iterator To,
3969 const T &IgnoreInsts) const
3970 -> bool {
3971 auto getLocOrNone =
3972 [this](const Instruction &I) -> std::optional<MemoryLocation> {
3973 if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
3974 switch (II->getIntrinsicID()) {
3975 case Intrinsic::masked_load:
3976 return MemoryLocation::getForArgument(II, 0, TLI);
3977 case Intrinsic::masked_store:
3978 return MemoryLocation::getForArgument(II, 1, TLI);
3979 }
3980 }
3981 return MemoryLocation::getOrNone(&I);
3982 };
3983
3984 // The source and the destination must be in the same basic block.
3985 const BasicBlock &Block = *In.getParent();
3986 assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
3987 // No PHIs.
3988 if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))
3989 return false;
3990
3991 if (!In.mayReadOrWriteMemory())
3992 return true;
3993 bool MayWrite = In.mayWriteToMemory();
3994 auto MaybeLoc = getLocOrNone(In);
3995
3996 auto From = In.getIterator();
3997 if (From == To)
3998 return true;
3999 bool MoveUp = (To != Block.end() && To->comesBefore(&In));
4000 auto Range =
4001 MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
4002 for (auto It = Range.first; It != Range.second; ++It) {
4003 const Instruction &I = *It;
4004 if (llvm::is_contained(IgnoreInsts, &I))
4005 continue;
4006 // assume intrinsic can be ignored
4007 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
4008 if (II->getIntrinsicID() == Intrinsic::assume)
4009 continue;
4010 }
4011 // Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
4012 if (I.mayThrow())
4013 return false;
4014 if (auto *CB = dyn_cast<CallBase>(&I)) {
4015 if (!CB->hasFnAttr(Attribute::WillReturn))
4016 return false;
4017 if (!CB->hasFnAttr(Attribute::NoSync))
4018 return false;
4019 }
4020 if (I.mayReadOrWriteMemory()) {
4021 auto MaybeLocI = getLocOrNone(I);
4022 if (MayWrite || I.mayWriteToMemory()) {
4023 if (!MaybeLoc || !MaybeLocI)
4024 return false;
4025 if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
4026 return false;
4027 }
4028 }
4029 }
4030 return true;
4031}
4032
4033auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
4034 if (auto *VecTy = dyn_cast<VectorType>(Ty))
4035 return VecTy->getElementType() == getByteTy();
4036 return false;
4037}
4038
4039auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
4040 Value *Hi, int Start,
4041 int Length) const -> Value * {
4042 assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
4043 SmallVector<int, 128> SMask(Length);
4044 std::iota(SMask.begin(), SMask.end(), Start);
4045 return Builder.CreateShuffleVector(Lo, Hi, SMask, "shf");
4046}
4047
4048// Pass management.
4049
4050namespace {
4051class HexagonVectorCombineLegacy : public FunctionPass {
4052public:
4053 static char ID;
4054
4055 HexagonVectorCombineLegacy() : FunctionPass(ID) {}
4056
4057 StringRef getPassName() const override { return "Hexagon Vector Combine"; }
4058
4059 void getAnalysisUsage(AnalysisUsage &AU) const override {
4060 AU.setPreservesCFG();
4061 AU.addRequired<AAResultsWrapperPass>();
4062 AU.addRequired<AssumptionCacheTracker>();
4063 AU.addRequired<DominatorTreeWrapperPass>();
4064 AU.addRequired<ScalarEvolutionWrapperPass>();
4065 AU.addRequired<TargetLibraryInfoWrapperPass>();
4066 AU.addRequired<TargetPassConfig>();
4067 FunctionPass::getAnalysisUsage(AU);
4068 }
4069
4070 bool runOnFunction(Function &F) override {
4071 if (skipFunction(F))
4072 return false;
4073 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
4074 AssumptionCache &AC =
4075 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
4076 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
4077 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
4078 TargetLibraryInfo &TLI =
4079 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
4080 auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
4081 HexagonVectorCombine HVC(F, AA, AC, DT, SE, TLI, TM);
4082 return HVC.run();
4083 }
4084};
4085} // namespace
4086
4087char HexagonVectorCombineLegacy::ID = 0;
4088
4089INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
4090 "Hexagon Vector Combine", false, false)
4091INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
4092INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4093INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
4094INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
4095INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
4096INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
4097INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
4098 "Hexagon Vector Combine", false, false)
4099
4100FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
4101 return new HexagonVectorCombineLegacy();
4102}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
AMDGPU Prepare AGPR Alloc
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
static IntegerType * getIntTy(IRBuilderBase &B, const TargetLibraryInfo *TLI)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file defines the DenseMap class.
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
hexagon bit simplify
Hexagon Common GEP
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
static Value * locateIndexesFromIntrinsic(Instruction *In)
Instruction * locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual)
Value * getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC, IRBuilderBase &Builder, LLVMContext &Ctx, Value *I)
static Value * locateIndexesFromGEP(Value *In)
#define CallBuilder(B, F)
Value * getPointer(Value *Ptr)
#define DEFAULT_HVX_VTCM_PAGE_SIZE
static Value * locateAddressFromIntrinsic(Instruction *In)
static Instruction * selectDestination(Instruction *In, HvxIdioms::DstQualifier &Qual)
Value * get_i32_Mask(const HexagonVectorCombine &HVC, IRBuilderBase &Builder, LLVMContext &Ctx, unsigned int pattern)
bool isArithmetic(unsigned Opc)
static Type * getIndexType(Value *In)
GetElementPtrInst * locateGepFromIntrinsic(Instruction *In)
Value * getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC, IRBuilderBase &Builder, LLVMContext &Ctx, Value *I)
static Align effectiveAlignForValueTy(const DataLayout &DL, Type *ValTy, int Requested)
iv Induction Variable Users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
#define H(x, y, z)
Definition MD5.cpp:56
static bool isCandidate(const MachineInstr *MI, Register &DefedReg, Register FrameReg)
static bool isUndef(const MachineInstr &MI)
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Remove Loads Into Fake Uses
static ConstantInt * getConstInt(MDNode *MD, unsigned NumOp)
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
Target-Independent Code Generator Pass Configuration Options pass.
static uint32_t getAlignment(const MCSectionCOFF &Sec)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
InstListType::const_iterator const_iterator
Definition BasicBlock.h:171
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
AttributeList getAttributes() const
Return the attributes for this call.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
unsigned getPointerSizeInBits(unsigned AS=0) const
The size in bits of the pointer representation in a given address space.
Definition DataLayout.h:490
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
iterator_range< iterator > children()
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getRootNode()
getRootNode - This returns the entry node for the CFG of the function.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:321
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const BasicBlock & back() const
Definition Function.h:860
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
bool isHVXVectorType(EVT VecTy, bool IncludeBool=false) const
unsigned getVectorLength() const
bool isTypeForHVX(Type *VecTy, bool IncludeBool=false) const
Intrinsic::ID getIntrinsicId(unsigned Opc) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition IRBuilder.h:1833
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2632
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2097
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1513
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2336
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2466
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1420
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1850
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1492
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2085
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2607
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1551
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1403
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2197
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2511
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2071
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1532
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1573
const char * getOpcodeName() const
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
void remove_if(Predicate Pred)
Remove the elements that match the predicate.
bool empty() const
Definition MapVector.h:77
size_type size() const
Definition MapVector.h:56
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220
bool onlyAccessesInaccessibleMem() const
Whether this function only (at most) accesses inaccessible memory.
Definition ModRef.h:239
static LLVM_ABI std::optional< MemoryLocation > getOrNone(const Instruction *Inst)
static LLVM_ABI MemoryLocation getForArgument(const CallBase *Call, unsigned ArgIdx, const TargetLibraryInfo *TLI)
Return a location representing a particular argument of a call.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
Target-Independent Code Generator Pass Configuration Options.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Value * getOperand(unsigned i) const
Definition User.h:233
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
Rounding
Possible values of current rounding mode, which is specified in bits 23:22 of FPCR.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate
Predicate - These are "(BI << 5) | BO" for various predicates.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches right shift operations (lshr or ashr).
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
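A minimal sketch, not taken from this file, of the matchers above, recognizing (X * Y) >> C for either right-shift kind; the helper name isMulThenShr is hypothetical:
#include "llvm/IR/Constants.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool isMulThenShr(Value *V, Value *&X, Value *&Y, ConstantInt *&C) {
  // m_Shr accepts lshr or ashr; the same check could be spelled with
  // m_CombineOr(m_LShr(...), m_AShr(...)).
  return match(V, m_Shr(m_Mul(m_Value(X), m_Value(Y)), m_ConstantInt(C)));
}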
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
constexpr double e
@ User
could "use" a pointer
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI Instruction * getTerminator() const
LLVM_ABI Instruction & front() const
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
FunctionPass * createHexagonVectorCombineLegacyPass()
@ Offset
Definition DWP.cpp:532
@ Length
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
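A minimal sketch, not taken from this file, of the range-based wrappers; allNonNull is a hypothetical helper:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static bool allNonNull(const SmallVectorImpl<Value *> &Vals) {
  // Equivalent to std::all_of(Vals.begin(), Vals.end(), ...).
  return all_of(Vals, [](const Value *V) { return V != nullptr; });
}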
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
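A minimal sketch, not taken from this file, of cleaning up a value that may have become dead; the helper tryErase is hypothetical:
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Value.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static void tryErase(Value *V, const TargetLibraryInfo &TLI) {
  // No-op unless V is a side-effect-free instruction with no uses; any
  // operands that become trivially dead are deleted as well.
  RecursivelyDeleteTriviallyDeadInstructions(V, &TLI);
}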
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2184
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:301
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1789
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
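A minimal sketch, not taken from this file, combining PowerOf2Ceil, isPowerOf2_64 and Log2_64; the helper name is hypothetical and assumes Bytes > 0:
#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"
using namespace llvm;

static unsigned log2CeilBytes(uint64_t Bytes) {
  uint64_t Rounded = PowerOf2Ceil(Bytes);   // e.g. 100 -> 128 (assumes Bytes > 0)
  assert(isPowerOf2_64(Rounded) && "rounded size must be a power of two");
  return Log2_64(Rounded);                  // e.g. 128 -> 7
}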
LLVM_ABI Value * simplifyInstruction(Instruction *I, const SimplifyQuery &Q)
See if we can compute a simplified version of this instruction.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition Local.h:252
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
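A minimal sketch, not taken from this file, of a known-bits query; knownTrailingZeros is a hypothetical helper and assumes V has integer type:
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

static unsigned knownTrailingZeros(const Value *V, const DataLayout &DL) {
  KnownBits Known(V->getType()->getScalarSizeInBits()); // assumes integer type
  computeKnownBits(V, Known, DL);
  return Known.countMinTrailingZeros(); // low bits guaranteed to be zero
}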
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
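A minimal sketch, not taken from this file, of Align-based padding; the helper paddedSize is hypothetical:
#include <cstdint>
#include "llvm/Support/Alignment.h"
using namespace llvm;

static uint64_t paddedSize(uint64_t Bytes, Align A) {
  // e.g. paddedSize(100, Align(128)) == 128; A.value() is the raw byte count.
  return alignTo(Bytes, A);
}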
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:2078
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
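A minimal sketch, not taken from this file, contrasting isa, cast and dyn_cast; pointerIfStore is a hypothetical helper:
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

static Value *pointerIfStore(Value *V) {
  if (!isa<StoreInst>(V))         // class test only, no conversion
    return nullptr;
  auto *SI = cast<StoreInst>(V);  // checked conversion, asserts on mismatch
  return SI->getPointerOperand();
  // The two steps collapse into one with dyn_cast:
  //   if (auto *SI = dyn_cast<StoreInst>(V)) return SI->getPointerOperand();
}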
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent to: C.erase(remove_if(C.begin(), C.end(), pred), C.end());
Definition STLExtras.h:2168
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
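A minimal sketch, not taken from this file, of the load/store helpers above; accessPointer is a hypothetical helper:
#include "llvm/IR/Instructions.h"
using namespace llvm;

static const Value *accessPointer(const Instruction &I, Type *&AccessTy) {
  AccessTy = nullptr;
  const Value *Ptr = getLoadStorePointerOperand(&I); // null if not a load/store
  if (Ptr)
    AccessTy = getLoadStoreType(&I);                 // type being loaded/stored
  return Ptr;
}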
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
MaskT vshuff(ArrayRef< int > Vu, ArrayRef< int > Vv, unsigned Size, bool TakeOdd)
MaskT vdeal(ArrayRef< int > Vu, ArrayRef< int > Vv, unsigned Size, bool TakeOdd)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
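A minimal sketch, not taken from this file, mapping an IR type to a machine value type with the EVT queries above; simpleVTFor is a hypothetical helper:
#include <optional>
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static std::optional<MVT> simpleVTFor(Type *Ty) {
  // HandleUnknown=true yields MVT::Other instead of asserting on odd types.
  EVT VT = EVT::getEVT(Ty, /*HandleUnknown=*/true);
  if (!VT.isSimple())          // extended EVTs (e.g. i3) have no MVT
    return std::nullopt;
  return VT.getSimpleVT();     // e.g. <64 x i8> -> MVT::v64i8
}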