LLVM 23.0.0git
HexagonVectorCombine.cpp
Go to the documentation of this file.
1//===-- HexagonVectorCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// HexagonVectorCombine is a utility class implementing a variety of functions
9// that assist in vector-based optimizations.
10//
11// AlignVectors: replace unaligned vector loads and stores with aligned ones.
12// HvxIdioms: recognize various opportunities to generate HVX intrinsic code.
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/DenseMap.h"
18#include "llvm/ADT/MapVector.h"
19#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/Dominators.h"
34#include "llvm/IR/IRBuilder.h"
36#include "llvm/IR/Intrinsics.h"
37#include "llvm/IR/IntrinsicsHexagon.h"
38#include "llvm/IR/Metadata.h"
41#include "llvm/Pass.h"
48
49#include "Hexagon.h"
50#include "HexagonSubtarget.h"
52
53#include <algorithm>
54#include <deque>
55#include <map>
56#include <optional>
57#include <set>
58#include <utility>
59#include <vector>
60
61#define DEBUG_TYPE "hexagon-vc"
62
63// This is a const that represents default HVX VTCM page size.
64// It is boot time configurable, so we probably want an API to
65// read it, but for now assume 128KB
66#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
67
68using namespace llvm;
69
70namespace {
// Hidden command-line switches for this file (all registered with cl::Hidden).
// NOTE(review): the code that reads these options is outside the visible
// text; the descriptions below are inferred from the option names -- confirm.

// "hvc-dump-module": boolean flag with no explicit default.
71cl::opt<bool> DumpModule("hvc-dump-module", cl::Hidden);
// "hvc-va" / "hvc-vi": booleans that default to true. Per the trailing
// comments they relate to the Align and Idioms parts of this file.
72cl::opt<bool> VAEnabled("hvc-va", cl::Hidden, cl::init(true)); // Align
73cl::opt<bool> VIEnabled("hvc-vi", cl::Hidden, cl::init(true)); // Idioms
// "hvc-va-full-stores": boolean flag with no explicit default.
74cl::opt<bool> VADoFullStores("hvc-va-full-stores", cl::Hidden);
75
// Group count/size limits. Both default to ~0 (all bits set), presumably
// meaning "no limit" -- TODO confirm against the code that consumes them.
76cl::opt<unsigned> VAGroupCountLimit("hvc-va-group-count-limit", cl::Hidden,
77 cl::init(~0));
78cl::opt<unsigned> VAGroupSizeLimit("hvc-va-group-size-limit", cl::Hidden,
79 cl::init(~0));
81 MinLoadGroupSizeForAlignment("hvc-ld-min-group-size-for-alignment",
83
// Shared context/utility object for the vector-combine code in this file.
// It keeps references to the function being processed and to the analyses
// passed to the constructor, and declares helpers for creating types and
// constants, querying sizes and known bits, and building vector operations
// through an IRBuilderBase.
// NOTE(review): the constructor's initializer list refers to AA, TLI, ORE,
// DT_, SE_ and ORE_, whose declarations/parameter lines are not present in
// the text visible here -- confirm against the complete source.
84class HexagonVectorCombine {
85public:
86 HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
88 TargetLibraryInfo &TLI_, const TargetMachine &TM_,
90 : F(F_), DL(F.getDataLayout()), AA(AA_), AC(AC_), DT(DT_), SE(SE_),
91 TLI(TLI_),
92 HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))),
93 ORE(ORE_) {}
94
95 bool run();
96
97 // Common integer type.
98 IntegerType *getIntTy(unsigned Width = 32) const;
99 // Byte type: either scalar (when ElemCount = 0), or vector with given
100 // element count.
101 Type *getByteTy(int ElemCount = 0) const;
102 // Boolean type: either scalar (when ElemCount = 0), or vector with given
103 // element count.
104 Type *getBoolTy(int ElemCount = 0) const;
105 // Create a ConstantInt of type returned by getIntTy with the value Val.
106 ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
107 // Get the integer value of Val, if it exists.
108 std::optional<APInt> getIntValue(const Value *Val) const;
109 // Is Val a constant 0, or a vector of 0s?
110 bool isZero(const Value *Val) const;
111 // Is Val an undef value?
112 bool isUndef(const Value *Val) const;
113 // Is Val a scalar (i1 true) or a vector of (i1 true)?
114 bool isTrue(const Value *Val) const;
115 // Is Val a scalar (i1 false) or a vector of (i1 false)?
116 bool isFalse(const Value *Val) const;
117
118 // Get HVX vector type with the given element type.
119 VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;
120
// Selects which notion of size getSizeOf reports.
121 enum SizeKind {
122 Store, // Store size
123 Alloc, // Alloc size
124 };
125 int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
126 int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
127 int getTypeAlignment(Type *Ty) const;
128 size_t length(Value *Val) const;
129 size_t length(Type *Ty) const;
130
131 Value *simplify(Value *Val) const;
132
// Helpers that emit IR through Builder and return the resulting Value.
// NOTE(review): their exact semantics are defined outside the visible text.
133 Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
134 int Length, int Where) const;
135 Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
136 Value *Amt) const;
137 Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
138 Value *Amt) const;
139 Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
140 Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
141 Value *Pad) const;
142 Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
143 Type *ToTy) const;
144 Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
145 Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
146 Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
147 unsigned Length) const;
148 Value *sublo(IRBuilderBase &Builder, Value *Val) const;
149 Value *subhi(IRBuilderBase &Builder, Value *Val) const;
150 Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
151 Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
152
153 Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
154 Type *RetTy, ArrayRef<Value *> Args,
155 ArrayRef<Type *> ArgTys = {},
156 ArrayRef<Value *> MDSources = {}) const;
157 SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
158 unsigned ToWidth) const;
159 Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
160 VectorType *ToType) const;
161
// Returns an empty optional when no difference is available.
// NOTE(review): presumably the byte distance between the pointers -- confirm.
162 std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
163
164 unsigned getNumSignificantBits(const Value *V,
165 const Instruction *CtxI = nullptr) const;
166 KnownBits getKnownBits(const Value *V,
167 const Instruction *CtxI = nullptr) const;
168
169 bool isSafeToClone(const Instruction &In) const;
170
// NOTE(review): one parameter line of this declaration is missing from the
// visible text.
171 template <typename T = std::vector<Instruction *>>
172 bool isSafeToMoveBeforeInBB(const Instruction &In,
174 const T &IgnoreInsts = {}) const;
175
176 // This function is only used for assertions at the moment.
177 [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
178
// Public state shared with the helper classes that hold a reference to this
// object.
179 Function &F;
180 const DataLayout &DL;
182 AssumptionCache &AC;
183 DominatorTree &DT;
184 ScalarEvolution &SE;
186 const HexagonSubtarget &HST;
188
189private:
190 Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
191 int Start, int Length) const;
192};
193
// Rewrites groups of unaligned vector loads/stores into aligned accesses (see
// the comment at the top of the class body). Holds a const reference to the
// shared HexagonVectorCombine object and the address groups built by
// createAddressGroups().
194class AlignVectors {
195 // This code tries to replace unaligned vector loads/stores with aligned
196 // ones.
197 // Consider unaligned load:
198 // %v = original_load %some_addr, align <bad>
199 // %user = %v
200 // It will generate
201 // = load ..., align <good>
202 // = load ..., align <good>
203 // = valign
204 // etc.
205 // %synthesize = combine/shuffle the loaded data so that it looks
206 // exactly like what "original_load" has loaded.
207 // %user = %synthesize
208 // Similarly for stores.
209public:
210 AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}
211
212 bool run();
213
214private:
// NOTE(review): InstMap is used below (MoveGroup::Clones, CloneMap
// parameters) but its alias declaration is not present in the visible text.
215 using InstList = std::vector<Instruction *>;
217
// Description of one memory access: the instruction, its address, the type of
// the accessed value, the alignment it has and the alignment computed by
// HVC.getTypeAlignment for that type.
218 struct AddrInfo {
219 AddrInfo(const AddrInfo &) = default;
220 AddrInfo &operator=(const AddrInfo &) = default;
221 AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
222 Align H)
223 : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
224 NeedAlign(HVC.getTypeAlignment(ValTy)) {}
225
226 // XXX: add Size member?
227 Instruction *Inst;
228 Value *Addr;
229 Type *ValTy;
230 Align HaveAlign;
231 Align NeedAlign;
232 int Offset = 0; // Offset (in bytes) from the first member of the
233 // containing AddrList.
234 };
235 using AddrList = std::vector<AddrInfo>;
236
// Strict ordering of instructions by Instruction::comesBefore; used as the
// comparator of DepList.
237 struct InstrLess {
238 bool operator()(const Instruction *A, const Instruction *B) const {
239 return A->comesBefore(B);
240 }
241 };
242 using DepList = std::set<Instruction *, InstrLess>;
243
// A set of instructions handled together. The converting constructor seeds
// Main with AI.Inst; the defaulted constructor leaves the scalar members
// uninitialized.
244 struct MoveGroup {
245 MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
246 : Base(B), Main{AI.Inst}, Clones{}, IsHvx(Hvx), IsLoad(Load) {}
247 MoveGroup() = default;
248 Instruction *Base; // Base instruction of the parent address group.
249 InstList Main; // Main group of instructions.
250 InstList Deps; // List of dependencies.
251 InstMap Clones; // Map from original Deps to cloned ones.
252 bool IsHvx; // Is this group of HVX instructions?
253 bool IsLoad; // Is this a load group?
254 };
255 using MoveList = std::vector<MoveGroup>;
256
257 struct ByteSpan {
258 // A representation of "interesting" bytes within a given span of memory.
259 // These bytes are those that are loaded or stored, and they don't have
260 // to cover the entire span of memory.
261 //
262 // The representation works by picking a contiguous sequence of bytes
263 // from somewhere within a llvm::Value, and placing it at a given offset
264 // within the span.
265 //
266 // The sequence of bytes from llvm::Value is represented by Segment.
267 // Block is Segment, plus where it goes in the span.
268 //
269 // An important feature of ByteSpan is being able to make a "section",
270 // i.e. creating another ByteSpan corresponding to a range of offsets
271 // relative to the source span.
272
273 struct Segment {
274 // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
275 Segment(Value *Val, int Begin, int Len)
276 : Val(Val), Start(Begin), Size(Len) {}
277 Segment(const Segment &Seg) = default;
278 Segment &operator=(const Segment &Seg) = default;
279 Value *Val; // Value representable as a sequence of bytes.
280 int Start; // First byte of the value that belongs to the segment.
281 int Size; // Number of bytes in the segment.
282 };
283
284 struct Block {
// The three-argument form takes the segment from byte 0 of Val.
285 Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
286 Block(Value *Val, int Off, int Len, int Pos)
287 : Seg(Val, Off, Len), Pos(Pos) {}
288 Block(const Block &Blk) = default;
289 Block &operator=(const Block &Blk) = default;
290 Segment Seg; // Value segment.
291 int Pos; // Position (offset) of the block in the span.
292 };
293
294 int extent() const;
295 ByteSpan section(int Start, int Length) const;
296 ByteSpan &shift(int Offset);
297 SmallVector<Value *, 8> values() const;
298
// Number of blocks (not bytes), and unchecked indexed access to them.
299 int size() const { return Blocks.size(); }
300 Block &operator[](int i) { return Blocks[i]; }
301 const Block &operator[](int i) const { return Blocks[i]; }
302
303 std::vector<Block> Blocks;
304
305 using iterator = decltype(Blocks)::iterator;
306 iterator begin() { return Blocks.begin(); }
307 iterator end() { return Blocks.end(); }
308 using const_iterator = decltype(Blocks)::const_iterator;
309 const_iterator begin() const { return Blocks.begin(); }
310 const_iterator end() const { return Blocks.end(); }
311 };
312
313 std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
314 bool isHvx(const AddrInfo &AI) const;
315 // This function is only used for assertions at the moment.
316 [[maybe_unused]] bool isSectorTy(Type *Ty) const;
317
318 Value *getPayload(Value *Val) const;
319 Value *getMask(Value *Val) const;
320 Value *getPassThrough(Value *Val) const;
321
322 Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
323 int Adjust,
324 const InstMap &CloneMap = InstMap()) const;
325 Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
326 int Alignment,
327 const InstMap &CloneMap = InstMap()) const;
328
329 Value *createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
330 Value *Predicate, int Alignment, Value *Mask,
331 Value *PassThru, ArrayRef<Value *> MDSources = {}) const;
332 Value *createSimpleLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
333 int Alignment,
334 ArrayRef<Value *> MDSources = {}) const;
335
336 Value *createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
337 Value *Predicate, int Alignment, Value *Mask,
338 ArrayRef<Value *> MDSources = {}) const;
339 Value *createSimpleStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
340 int Alignment,
341 ArrayRef<Value *> MDSources = {}) const;
342
343 Value *createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
344 Value *Predicate, int Alignment,
345 ArrayRef<Value *> MDSources = {}) const;
346 Value *createPredicatedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
347 Value *Predicate, int Alignment,
348 ArrayRef<Value *> MDSources = {}) const;
349
350 DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
351 bool createAddressGroups();
352 MoveList createLoadGroups(const AddrList &Group) const;
353 MoveList createStoreGroups(const AddrList &Group) const;
354 bool moveTogether(MoveGroup &Move) const;
355 template <typename T>
356 InstMap cloneBefore(BasicBlock::iterator To, T &&Insts) const;
357
358 void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
359 int ScLen, Value *AlignVal, Value *AlignAddr) const;
360 void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
361 int ScLen, Value *AlignVal, Value *AlignAddr) const;
362 bool realignGroup(const MoveGroup &Move);
363 Value *makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
364 int Alignment) const;
365
// Address groups keyed by an Instruction; MapVector keeps insertion order.
366 using AddrGroupMap = MapVector<Instruction *, AddrList>;
367 AddrGroupMap AddrGroups;
368
// The printing operators are friends so they can name the private nested
// types above.
369 friend raw_ostream &operator<<(raw_ostream &OS, const AddrList &L);
370 friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
371 friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
372 friend raw_ostream &operator<<(raw_ostream &OS, const MoveList &L);
373 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
374 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
375 friend raw_ostream &operator<<(raw_ostream &OS, const AddrGroupMap &AG);
// NOTE(review): the next seven friend declarations repeat the seven directly
// above verbatim; they are redundant and could be removed.
376 friend raw_ostream &operator<<(raw_ostream &OS, const AddrList &L);
377 friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
378 friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
379 friend raw_ostream &operator<<(raw_ostream &OS, const MoveList &L);
380 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
381 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
382 friend raw_ostream &operator<<(raw_ostream &OS, const AddrGroupMap &AG);
383
384 const HexagonVectorCombine &HVC;
385};
386
387[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
388 const AlignVectors::AddrGroupMap &AG) {
389 OS << "Printing AddrGroups:"
390 << "\n";
391 for (auto &It : AG) {
392 OS << "\n\tInstruction: ";
393 It.first->dump();
394 OS << "\n\tAddrInfo: ";
395 for (auto &AI : It.second)
396 OS << AI << "\n";
397 }
398 return OS;
399}
400
401[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
402 const AlignVectors::AddrList &AL) {
403 OS << "\n *** Addr List: ***\n";
404 for (auto &AG : AL) {
405 OS << "\n *** Addr Group: ***\n";
406 OS << AG;
407 OS << "\n";
408 }
409 return OS;
410}
411
412[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
413 const AlignVectors::AddrInfo &AI) {
414 OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
415 OS << "Addr: " << *AI.Addr << '\n';
416 OS << "Type: " << *AI.ValTy << '\n';
417 OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
418 OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
419 OS << "Offset: " << AI.Offset;
420 return OS;
421}
422
423[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
424 const AlignVectors::MoveList &ML) {
425 OS << "\n *** Move List: ***\n";
426 for (auto &MG : ML) {
427 OS << "\n *** Move Group: ***\n";
428 OS << MG;
429 OS << "\n";
430 }
431 return OS;
432}
433
434[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
435 const AlignVectors::MoveGroup &MG) {
436 OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
437 OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
438 OS << "Main\n";
439 for (Instruction *I : MG.Main)
440 OS << " " << *I << '\n';
441 OS << "Deps\n";
442 for (Instruction *I : MG.Deps)
443 OS << " " << *I << '\n';
444 OS << "Clones\n";
445 for (auto [K, V] : MG.Clones) {
446 OS << " ";
447 K->printAsOperand(OS, false);
448 OS << "\t-> " << *V << '\n';
449 }
450 return OS;
451}
452
453[[maybe_unused]] raw_ostream &
454operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) {
455 OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
456 if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {
457 OS << "(self:" << B.Seg.Val << ')';
458 } else if (B.Seg.Val != nullptr) {
459 OS << *B.Seg.Val;
460 } else {
461 OS << "(null)";
462 }
463 return OS;
464}
465
466[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
467 const AlignVectors::ByteSpan &BS) {
468 OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
469 for (const AlignVectors::ByteSpan::Block &B : BS)
470 OS << B << '\n';
471 OS << ']';
472 return OS;
473}
474
// Idiom recognition built on top of HexagonVectorCombine. The constructor
// caches the single and pair HVX vector types with 32-bit integer elements.
// The private interface declares matchers/processors for fixed-point
// multiplies, masked loads/stores and gather/scatter.
// NOTE(review): the definitions of these members are outside the visible
// text; the summaries here are derived from the declarations only.
475class HvxIdioms {
476public:
// NOTE(review): the meaning of each qualifier is not established by the
// visible text.
477 enum DstQualifier {
478 Undefined = 0,
479 Arithmetic,
480 LdSt,
481 LLVM_Gather,
482 LLVM_Scatter,
483 HEX_Gather_Scatter,
484 HEX_Gather,
485 HEX_Scatter,
486 Call
487 };
488
489 HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
490 auto *Int32Ty = HVC.getIntTy(32);
491 HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
492 HvxP32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/true);
493 }
494
495 bool run();
496
497private:
// The enumerator order is relied upon by the FxpOp printer, which indexes a
// name table with these values.
498 enum Signedness { Positive, Signed, Unsigned };
499
500 // Value + sign
501 // This is to keep track of whether the value should be treated as signed
502 // or unsigned, or is known to be positive.
503 struct SValue {
504 Value *Val;
505 Signedness Sgn;
506 };
507
// Description of a matched fixed-point operation.
508 struct FxpOp {
509 unsigned Opcode;
510 unsigned Frac; // Number of fraction bits
511 SValue X, Y;
512 // If present, add 1 << RoundAt before shift:
513 std::optional<unsigned> RoundAt;
514 VectorType *ResTy;
515 };
516
517 auto getNumSignificantBits(Value *V, Instruction *In) const
518 -> std::pair<unsigned, Signedness>;
519 auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;
520
521 auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
522 auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;
523
524 auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
525 const FxpOp &Op) const -> Value *;
526 auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
527 bool Rounding) const -> Value *;
528 auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
529 bool Rounding) const -> Value *;
530 // Return {Result, Carry}, where Carry is a vector predicate.
531 auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
532 Value *CarryIn = nullptr) const
533 -> std::pair<Value *, Value *>;
534 auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
535 auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
536 -> Value *;
537 auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
538 -> std::pair<Value *, Value *>;
// NOTE(review): the remainder of the createAddLong declaration is missing
// from the visible text.
539 auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
541 auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
542 Signedness SgnX, ArrayRef<Value *> WordY,
543 Signedness SgnY) const -> SmallVector<Value *>;
544
545 bool matchMLoad(Instruction &In) const;
546 bool matchMStore(Instruction &In) const;
547 Value *processMLoad(Instruction &In) const;
548 Value *processMStore(Instruction &In) const;
549 std::optional<uint64_t> getAlignment(Instruction &In, Value *ptr) const;
550 std::optional<uint64_t>
551 getAlignmentImpl(Instruction &In, Value *ptr,
552 SmallPtrSet<Value *, 16> &Visited) const;
553 std::optional<uint64_t> getPHIBaseMinAlignment(Instruction &In,
554 PHINode *PN) const;
555
556 // Vector manipulations for Ripple
557 bool matchScatter(Instruction &In) const;
558 bool matchGather(Instruction &In) const;
559 Value *processVScatter(Instruction &In) const;
560 Value *processVGather(Instruction &In) const;
561
// HVX vector types with i32 elements: single vector and vector pair.
562 VectorType *HvxI32Ty;
563 VectorType *HvxP32Ty;
564 const HexagonVectorCombine &HVC;
565
566 friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
567};
568
569[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
570 const HvxIdioms::FxpOp &Op) {
571 static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
572 OS << Instruction::getOpcodeName(Op.Opcode) << '.' << Op.Frac;
573 if (Op.RoundAt.has_value()) {
574 if (Op.Frac != 0 && *Op.RoundAt == Op.Frac - 1) {
575 OS << ":rnd";
576 } else {
577 OS << " + 1<<" << *Op.RoundAt;
578 }
579 }
580 OS << "\n X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
581 << " Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
582 return OS;
583}
584
585} // namespace
586
587namespace {
588
// Pass MaybeT through only when it is non-null and reports isUnordered();
// otherwise return nullptr.
template <typename T> T *getIfUnordered(T *MaybeT) {
  if (MaybeT == nullptr)
    return nullptr;
  if (!MaybeT->isUnordered())
    return nullptr;
  return MaybeT;
}
// Generic candidate filter: returns In converted to T with dyn_cast, i.e.
// nullptr when In is not a T. In must not be null (dyn_cast requirement).
592template <typename T> T *isCandidate(Instruction *In) {
593 return dyn_cast<T>(In);
594}
596 return getIfUnordered(dyn_cast<LoadInst>(In));
597}
599 return getIfUnordered(dyn_cast<StoreInst>(In));
600}
601
602// Forward other erase_ifs to the LLVM implementations.
// Thin wrapper: perfectly forwards `container` to llvm::erase_if together
// with the predicate `p`.
// NOTE(review): "other" suggests container-specific overloads exist
// elsewhere; none are present in the visible text.
603template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
604 llvm::erase_if(std::forward<T>(container), p);
605}
606
607} // namespace
608
609// --- Begin AlignVectors
610
611// For brevity, only consider loads. We identify a group of loads where we
612// know the relative differences between their addresses, so we know how they
613// are laid out in memory (relative to one another). These loads can overlap,
614// can be shorter or longer than the desired vector length.
615// Ultimately we want to generate a sequence of aligned loads that will load
616// every byte that the original loads loaded, and have the program use these
617// loaded values instead of the original loads.
618// We consider the contiguous memory area spanned by all these loads.
619//
620// Let's say that a single aligned vector load can load 16 bytes at a time.
621// If the program wanted to use a byte at offset 13 from the beginning of the
622// original span, it will be a byte at offset 13+x in the aligned data for
623// some x>=0. This may happen to be in the first aligned load, or in the load
624// following it. Since we generally don't know what that alignment value
625// is at compile time, we proactively do valigns on the aligned loads, so that
626// the byte that was at offset 13 is still at offset 13 after the valigns.
627//
628// This will be the starting point for making the rest of the program use the
629// data loaded by the new loads.
630// For each original load, and its users:
631// %v = load ...
632// ... = %v
633// ... = %v
634// we create
635// %new_v = extract/combine/shuffle data from loaded/valigned vectors so
636// it contains the same value as %v did before
637// then replace all users of %v with %new_v.
638// ... = %new_v
639// ... = %new_v
640
641auto AlignVectors::ByteSpan::extent() const -> int {
642 if (size() == 0)
643 return 0;
644 int Min = Blocks[0].Pos;
645 int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
646 for (int i = 1, e = size(); i != e; ++i) {
647 Min = std::min(Min, Blocks[i].Pos);
648 Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);
649 }
650 return Max - Min;
651}
652
653auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
654 ByteSpan Section;
655 for (const ByteSpan::Block &B : Blocks) {
656 int L = std::max(B.Pos, Start); // Left end.
657 int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.
658 if (L < R) {
659 // How much to chop off the beginning of the segment:
660 int Off = L > B.Pos ? L - B.Pos : 0;
661 Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);
662 }
663 }
664 return Section;
665}
666
667auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
668 for (Block &B : Blocks)
669 B.Pos += Offset;
670 return *this;
671}
672
673auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
674 SmallVector<Value *, 8> Values(Blocks.size());
675 for (int i = 0, e = Blocks.size(); i != e; ++i)
676 Values[i] = Blocks[i].Seg.Val;
677 return Values;
678}
679
680// Turn a requested integer alignment into the effective Align to use.
681// A positive Requested is used as given. Any other value (0, which meant
682// "ABI alignment" in old IR, or a negative number) selects the ABI alignment
// of ValTy as reported by the data layout DL.
// NOTE(review): the first line of the signature is missing from the visible
// text; call sites pass (HVC.DL, <type>, <int alignment>).
684 int Requested) {
685 if (Requested > 0)
686 return Align(static_cast<uint64_t>(Requested));
687 return Align(DL.getABITypeAlign(ValTy).value());
688}
689
690auto AlignVectors::getAddrInfo(Instruction &In) const
691 -> std::optional<AddrInfo> {
692 if (auto *L = isCandidate<LoadInst>(&In))
693 return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
694 L->getAlign());
695 if (auto *S = isCandidate<StoreInst>(&In))
696 return AddrInfo(HVC, S, S->getPointerOperand(),
697 S->getValueOperand()->getType(), S->getAlign());
698 if (auto *II = isCandidate<IntrinsicInst>(&In)) {
699 Intrinsic::ID ID = II->getIntrinsicID();
700 switch (ID) {
701 case Intrinsic::masked_load:
702 return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),
703 II->getParamAlign(0).valueOrOne());
704 case Intrinsic::masked_store:
705 return AddrInfo(HVC, II, II->getArgOperand(1),
706 II->getArgOperand(0)->getType(),
707 II->getParamAlign(1).valueOrOne());
708 }
709 }
710 return std::nullopt;
711}
712
// Returns whether the subtarget (HVC.HST) reports AI's value type as a type
// for HVX.
713auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
714 return HVC.HST.isTypeForHVX(AI.ValTy);
715}
716
717auto AlignVectors::getPayload(Value *Val) const -> Value * {
718 if (auto *In = dyn_cast<Instruction>(Val)) {
719 Intrinsic::ID ID = 0;
720 if (auto *II = dyn_cast<IntrinsicInst>(In))
721 ID = II->getIntrinsicID();
722 if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)
723 return In->getOperand(0);
724 }
725 return Val;
726}
727
728auto AlignVectors::getMask(Value *Val) const -> Value * {
729 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
730 switch (II->getIntrinsicID()) {
731 case Intrinsic::masked_load:
732 return II->getArgOperand(1);
733 case Intrinsic::masked_store:
734 return II->getArgOperand(2);
735 }
736 }
737
738 Type *ValTy = getPayload(Val)->getType();
739 if (auto *VecTy = dyn_cast<VectorType>(ValTy))
740 return Constant::getAllOnesValue(HVC.getBoolTy(HVC.length(VecTy)));
741 return Constant::getAllOnesValue(HVC.getBoolTy());
742}
743
744auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
745 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
746 if (II->getIntrinsicID() == Intrinsic::masked_load)
747 return II->getArgOperand(2);
748 }
749 return UndefValue::get(getPayload(Val)->getType());
750}
751
752auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
753 Type *ValTy, int Adjust,
754 const InstMap &CloneMap) const
755 -> Value * {
756 if (auto *I = dyn_cast<Instruction>(Ptr))
757 if (Instruction *New = CloneMap.lookup(I))
758 Ptr = New;
759 return Builder.CreatePtrAdd(Ptr, HVC.getConstInt(Adjust), "gep");
760}
761
762auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
763 Type *ValTy, int Alignment,
764 const InstMap &CloneMap) const
765 -> Value * {
766 auto remap = [&](Value *V) -> Value * {
767 if (auto *I = dyn_cast<Instruction>(V)) {
768 for (auto [Old, New] : CloneMap)
769 I->replaceUsesOfWith(Old, New);
770 return I;
771 }
772 return V;
773 };
774 Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy(), "pti");
775 Value *Mask = HVC.getConstInt(-Alignment);
776 Value *And = Builder.CreateAnd(remap(AsInt), Mask, "and");
777 return Builder.CreateIntToPtr(
778 And, PointerType::getUnqual(ValTy->getContext()), "itp");
779}
780
// Emit a load of ValTy from Ptr, picking the cheapest form the scalar
// Predicate and the Mask allow:
//  - Predicate known false: no load, returns undef.
//  - Predicate not a known constant: predicated load, then a select between
//    the loaded value and PassThru under Mask.
//  - Mask all zero: returns PassThru without loading.
//  - Mask all true: plain aligned load.
//  - otherwise: masked load with PassThru.
// MDSources is forwarded for metadata propagation onto the created load.
781auto AlignVectors::createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
782 Value *Predicate, int Alignment, Value *Mask,
783 Value *PassThru,
784 ArrayRef<Value *> MDSources) const -> Value * {
785 // Predicate is nullptr if not creating predicated load
786 if (Predicate) {
787 assert(!Predicate->getType()->isVectorTy() &&
788 "Expectning scalar predicate");
789 if (HVC.isFalse(Predicate))
790 return UndefValue::get(ValTy);
791 if (!HVC.isTrue(Predicate)) {
792 Value *Load = createPredicatedLoad(Builder, ValTy, Ptr, Predicate,
793 Alignment, MDSources);
794 return Builder.CreateSelect(Mask, Load, PassThru);
795 }
796 // Predicate == true here.
797 }
798 assert(!HVC.isUndef(Mask)); // Should this be allowed?
799 if (HVC.isZero(Mask))
800 return PassThru;
801
// A non-positive Alignment is replaced by the ABI alignment of ValTy.
802 Align EffA = effectiveAlignForValueTy(HVC.DL, ValTy, Alignment);
803 if (HVC.isTrue(Mask))
804 return createSimpleLoad(Builder, ValTy, Ptr, EffA.value(), MDSources);
805
// NOTE(review): the line declaring `Load` (the result of the call below) is
// missing from the visible text.
807 Builder.CreateMaskedLoad(ValTy, Ptr, EffA, Mask, PassThru, "mld");
808 LLVM_DEBUG(dbgs() << "\t[Creating masked Load:] "; Load->dump());
809 propagateMetadata(Load, MDSources);
810 return Load;
811}
812
813auto AlignVectors::createSimpleLoad(IRBuilderBase &Builder, Type *ValTy,
814 Value *Ptr, int Alignment,
815 ArrayRef<Value *> MDSources) const
816 -> Value * {
817 Align EffA = effectiveAlignForValueTy(HVC.DL, ValTy, Alignment);
818 Instruction *Load = Builder.CreateAlignedLoad(ValTy, Ptr, EffA, "ald");
819 propagateMetadata(Load, MDSources);
820 LLVM_DEBUG(dbgs() << "\t[Creating Load:] "; Load->dump());
821 return Load;
822}
823
824auto AlignVectors::createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy,
825 Value *Ptr, Value *Predicate,
826 int Alignment,
827 ArrayRef<Value *> MDSources) const
828 -> Value * {
829 assert(HVC.HST.isTypeForHVX(ValTy) &&
830 "Predicates 'scalar' vector loads not yet supported");
831 assert(Predicate);
832 assert(!Predicate->getType()->isVectorTy() && "Expectning scalar predicate");
833 Align EffA = effectiveAlignForValueTy(HVC.DL, ValTy, Alignment);
834 assert(HVC.getSizeOf(ValTy, HVC.Alloc) % EffA.value() == 0);
835
836 if (HVC.isFalse(Predicate))
837 return UndefValue::get(ValTy);
838 if (HVC.isTrue(Predicate))
839 return createSimpleLoad(Builder, ValTy, Ptr, EffA.value(), MDSources);
840
841 auto V6_vL32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vL32b_pred_ai);
842 // FIXME: This may not put the offset from Ptr into the vmem offset.
843 return HVC.createHvxIntrinsic(Builder, V6_vL32b_pred_ai, ValTy,
844 {Predicate, Ptr, HVC.getConstInt(0)}, {},
845 MDSources);
846}
847
848auto AlignVectors::createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
849 Value *Predicate, int Alignment, Value *Mask,
850 ArrayRef<Value *> MDSources) const -> Value * {
851 if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))
852 return UndefValue::get(Val->getType());
853 assert(!Predicate || (!Predicate->getType()->isVectorTy() &&
854 "Expectning scalar predicate"));
855 if (Predicate) {
856 if (HVC.isFalse(Predicate))
857 return UndefValue::get(Val->getType());
858 if (HVC.isTrue(Predicate))
859 Predicate = nullptr;
860 }
861 // Here both Predicate and Mask are true or unknown.
862
863 if (HVC.isTrue(Mask)) {
864 if (Predicate) { // Predicate unknown
865 return createPredicatedStore(Builder, Val, Ptr, Predicate, Alignment,
866 MDSources);
867 }
868 // Predicate is true:
869 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
870 }
871
872 // Mask is unknown
873 if (!Predicate) {
875 Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
876 propagateMetadata(Store, MDSources);
877 return Store;
878 }
879
880 // Both Predicate and Mask are unknown.
881 // Emulate masked store with predicated-load + mux + predicated-store.
882 Value *PredLoad = createPredicatedLoad(Builder, Val->getType(), Ptr,
883 Predicate, Alignment, MDSources);
884 Value *Mux = Builder.CreateSelect(Mask, Val, PredLoad);
885 return createPredicatedStore(Builder, Mux, Ptr, Predicate, Alignment,
886 MDSources);
887}
888
889auto AlignVectors::createSimpleStore(IRBuilderBase &Builder, Value *Val,
890 Value *Ptr, int Alignment,
891 ArrayRef<Value *> MDSources) const
892 -> Value * {
893 Align EffA = effectiveAlignForValueTy(HVC.DL, Val->getType(), Alignment);
894 Instruction *Store = Builder.CreateAlignedStore(Val, Ptr, EffA);
895 LLVM_DEBUG(dbgs() << "\t[Creating store:] "; Store->dump());
896 propagateMetadata(Store, MDSources);
897 return Store;
898}
899
900auto AlignVectors::createPredicatedStore(IRBuilderBase &Builder, Value *Val,
901 Value *Ptr, Value *Predicate,
902 int Alignment,
903 ArrayRef<Value *> MDSources) const
904 -> Value * {
905 Align EffA = effectiveAlignForValueTy(HVC.DL, Val->getType(), Alignment);
906 assert(HVC.HST.isTypeForHVX(Val->getType()) &&
907 "Predicates 'scalar' vector stores not yet supported");
908 assert(Predicate);
909 if (HVC.isFalse(Predicate))
910 return UndefValue::get(Val->getType());
911 if (HVC.isTrue(Predicate))
912 return createSimpleStore(Builder, Val, Ptr, EffA.value(), MDSources);
913
914 assert(HVC.getSizeOf(Val, HVC.Alloc) % EffA.value() == 0);
915 auto V6_vS32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vS32b_pred_ai);
916 // FIXME: This may not put the offset from Ptr into the vmem offset.
917 return HVC.createHvxIntrinsic(Builder, V6_vS32b_pred_ai, nullptr,
918 {Predicate, Ptr, HVC.getConstInt(0), Val}, {},
919 MDSources);
920}
921
922auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
923 -> DepList {
924 BasicBlock *Parent = Base->getParent();
925 assert(In->getParent() == Parent &&
926 "Base and In should be in the same block");
927 assert(Base->comesBefore(In) && "Base should come before In");
928
929 DepList Deps;
930 std::deque<Instruction *> WorkQ = {In};
931 while (!WorkQ.empty()) {
932 Instruction *D = WorkQ.front();
933 WorkQ.pop_front();
934 if (D != In)
935 Deps.insert(D);
936 for (Value *Op : D->operands()) {
937 if (auto *I = dyn_cast<Instruction>(Op)) {
938 if (I->getParent() == Parent && Base->comesBefore(I))
939 WorkQ.push_back(I);
940 }
941 }
942 }
943 return Deps;
944}
945
946auto AlignVectors::createAddressGroups() -> bool {
947 // An address group created here may contain instructions spanning
948 // multiple basic blocks.
949 AddrList WorkStack;
950
951 auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
952 for (AddrInfo &W : WorkStack) {
953 if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))
954 return std::make_pair(W.Inst, *D);
955 }
956 return std::make_pair(nullptr, 0);
957 };
958
959 auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
960 BasicBlock &Block = *DomN->getBlock();
961 for (Instruction &I : Block) {
962 auto AI = this->getAddrInfo(I); // Use this-> for gcc6.
963 if (!AI)
964 continue;
965 auto F = findBaseAndOffset(*AI);
966 Instruction *GroupInst;
967 if (Instruction *BI = F.first) {
968 AI->Offset = F.second;
969 GroupInst = BI;
970 } else {
971 WorkStack.push_back(*AI);
972 GroupInst = AI->Inst;
973 }
974 AddrGroups[GroupInst].push_back(*AI);
975 }
976
977 for (DomTreeNode *C : DomN->children())
978 Visit(C, Visit);
979
980 while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
981 WorkStack.pop_back();
982 };
983
984 traverseBlock(HVC.DT.getRootNode(), traverseBlock);
985 assert(WorkStack.empty());
986
987 // AddrGroups are formed.
988 // Remove groups of size 1.
989 AddrGroups.remove_if([](auto &G) { return G.second.size() == 1; });
990 // Remove groups that don't use HVX types.
991 AddrGroups.remove_if([&](auto &G) {
992 return llvm::none_of(
993 G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
994 });
995
996 LLVM_DEBUG(dbgs() << AddrGroups);
997 return !AddrGroups.empty();
998}
999
1000auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
1001 // Form load groups.
1002 // To avoid complications with moving code across basic blocks, only form
1003 // groups that are contained within a single basic block.
1004 unsigned SizeLimit = VAGroupSizeLimit;
1005 if (SizeLimit == 0)
1006 return {};
1007
1008 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1009 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1010 if (Move.Main.size() >= SizeLimit) {
1011 HVC.ORE.emit([&]() {
1012 return OptimizationRemarkMissed(DEBUG_TYPE, "GroupSizeLimitExceeded",
1013 Info.Inst->getDebugLoc(),
1014 Info.Inst->getParent())
1015 << "alignment group exceeds size limit";
1016 });
1017 return false;
1018 }
1019 // Don't mix HVX and non-HVX instructions.
1020 if (Move.IsHvx != isHvx(Info))
1021 return false;
1022 // Leading instruction in the load group.
1023 Instruction *Base = Move.Main.front();
1024 if (Base->getParent() != Info.Inst->getParent())
1025 return false;
1026 // Check if it's safe to move the load.
1027 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator())) {
1028 HVC.ORE.emit([&]() {
1029 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsafeToRelocate",
1030 Info.Inst->getDebugLoc(),
1031 Info.Inst->getParent())
1032 << "unsafe to relocate memory access for alignment";
1033 });
1034 return false;
1035 }
1036 // And if it's safe to clone the dependencies.
1037 auto isSafeToCopyAtBase = [&](const Instruction *I) {
1038 return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator()) &&
1039 HVC.isSafeToClone(*I);
1040 };
1041 DepList Deps = getUpwardDeps(Info.Inst, Base);
1042 if (!llvm::all_of(Deps, isSafeToCopyAtBase))
1043 return false;
1044
1045 Move.Main.push_back(Info.Inst);
1046 llvm::append_range(Move.Deps, Deps);
1047 return true;
1048 };
1049
1050 MoveList LoadGroups;
1051
1052 for (const AddrInfo &Info : Group) {
1053 if (!Info.Inst->mayReadFromMemory())
1054 continue;
1055 if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
1056 LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);
1057 }
1058
1059 // Erase groups smaller than the minimum load group size.
1060 unsigned LoadGroupSizeLimit = MinLoadGroupSizeForAlignment;
1061 erase_if(LoadGroups, [LoadGroupSizeLimit](const MoveGroup &G) {
1062 return G.Main.size() < LoadGroupSizeLimit;
1063 });
1064
1065 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1066 if (!HVC.HST.useHVXV62Ops()) {
1067 bool HadHvx =
1068 llvm::any_of(LoadGroups, [](const MoveGroup &G) { return G.IsHvx; });
1069 erase_if(LoadGroups, [](const MoveGroup &G) { return G.IsHvx; });
1070 if (HadHvx) {
1071 HVC.ORE.emit([&]() {
1072 return OptimizationRemarkMissed(DEBUG_TYPE, "HvxVersionTooLow",
1073 HVC.F.getSubprogram(), &HVC.F.front())
1074 << "HVX version too low for predicated load operations";
1075 });
1076 }
1077 }
1078
1079 LLVM_DEBUG(dbgs() << "LoadGroups list: " << LoadGroups);
1080 return LoadGroups;
1081}
1082
1083auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
1084 // Form store groups.
1085 // To avoid complications with moving code across basic blocks, only form
1086 // groups that are contained within a single basic block.
1087 unsigned SizeLimit = VAGroupSizeLimit;
1088 if (SizeLimit == 0)
1089 return {};
1090
1091 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1092 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1093 if (Move.Main.size() >= SizeLimit) {
1094 HVC.ORE.emit([&]() {
1095 return OptimizationRemarkMissed(DEBUG_TYPE, "GroupSizeLimitExceeded",
1096 Info.Inst->getDebugLoc(),
1097 Info.Inst->getParent())
1098 << "alignment group exceeds size limit";
1099 });
1100 return false;
1101 }
1102 // For stores with return values we'd have to collect downward dependencies.
1103 // There are no such stores that we handle at the moment, so omit that.
1104 assert(Info.Inst->getType()->isVoidTy() &&
1105 "Not handling stores with return values");
1106 // Don't mix HVX and non-HVX instructions.
1107 if (Move.IsHvx != isHvx(Info))
1108 return false;
1109 // For stores we need to be careful whether it's safe to move them.
1110 // Stores that are otherwise safe to move together may not appear safe
1111 // to move over one another (i.e. isSafeToMoveBefore may return false).
1112 Instruction *Base = Move.Main.front();
1113 if (Base->getParent() != Info.Inst->getParent())
1114 return false;
1115 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(),
1116 Move.Main)) {
1117 HVC.ORE.emit([&]() {
1118 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsafeToRelocate",
1119 Info.Inst->getDebugLoc(),
1120 Info.Inst->getParent())
1121 << "unsafe to relocate memory access for alignment";
1122 });
1123 return false;
1124 }
1125 Move.Main.push_back(Info.Inst);
1126 return true;
1127 };
1128
1129 MoveList StoreGroups;
1130
1131 for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
1132 const AddrInfo &Info = *I;
1133 if (!Info.Inst->mayWriteToMemory())
1134 continue;
1135 if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
1136 StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);
1137 }
1138
1139 // Erase singleton groups.
1140 erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
1141
1142 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1143 if (!HVC.HST.useHVXV62Ops()) {
1144 bool HadHvx =
1145 llvm::any_of(StoreGroups, [](const MoveGroup &G) { return G.IsHvx; });
1146 erase_if(StoreGroups, [](const MoveGroup &G) { return G.IsHvx; });
1147 if (HadHvx) {
1148 HVC.ORE.emit([&]() {
1149 return OptimizationRemarkMissed(DEBUG_TYPE, "HvxVersionTooLow",
1150 HVC.F.getSubprogram(), &HVC.F.front())
1151 << "HVX version too low for predicated store operations";
1152 });
1153 }
1154 }
1155
1156 // Erase groups where every store is a full HVX vector. The reason is that
1157 // aligning predicated stores generates complex code that may be less
1158 // efficient than a sequence of unaligned vector stores.
1159 if (!VADoFullStores) {
1160 erase_if(StoreGroups, [this](const MoveGroup &G) {
1161 return G.IsHvx && llvm::all_of(G.Main, [this](Instruction *S) {
1162 auto MaybeInfo = this->getAddrInfo(*S);
1163 assert(MaybeInfo.has_value());
1164 return HVC.HST.isHVXVectorType(
1165 EVT::getEVT(MaybeInfo->ValTy, false));
1166 });
1167 });
1168 }
1169
1170 return StoreGroups;
1171}
1172
1173auto AlignVectors::moveTogether(MoveGroup &Move) const -> bool {
1174 // Move all instructions to be adjacent.
1175 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1176 Instruction *Where = Move.Main.front();
1177
1178 if (Move.IsLoad) {
1179 // Move all the loads (and dependencies) to where the first load is.
1180 // Clone all deps to before Where, keeping order.
1181 Move.Clones = cloneBefore(Where->getIterator(), Move.Deps);
1182 // Move all main instructions to after Where, keeping order.
1183 ArrayRef<Instruction *> Main(Move.Main);
1184 for (Instruction *M : Main) {
1185 if (M != Where)
1186 M->moveAfter(Where);
1187 for (auto [Old, New] : Move.Clones)
1188 M->replaceUsesOfWith(Old, New);
1189 Where = M;
1190 }
1191 // Replace Deps with the clones.
1192 for (int i = 0, e = Move.Deps.size(); i != e; ++i)
1193 Move.Deps[i] = Move.Clones[Move.Deps[i]];
1194 } else {
1195 // Move all the stores to where the last store is.
1196 // NOTE: Deps are empty for "store" groups. If they need to be
1197 // non-empty, decide on the order.
1198 assert(Move.Deps.empty());
1199 // Move all main instructions to before Where, inverting order.
1200 ArrayRef<Instruction *> Main(Move.Main);
1201 for (Instruction *M : Main.drop_front(1)) {
1202 M->moveBefore(Where->getIterator());
1203 Where = M;
1204 }
1205 }
1206
1207 return Move.Main.size() + Move.Deps.size() > 1;
1208}
1209
1210template <typename T>
1211auto AlignVectors::cloneBefore(BasicBlock::iterator To, T &&Insts) const
1212 -> InstMap {
1213 InstMap Map;
1214
1215 for (Instruction *I : Insts) {
1216 assert(HVC.isSafeToClone(*I));
1217 Instruction *C = I->clone();
1218 C->setName(Twine("c.") + I->getName() + ".");
1219 C->insertBefore(To);
1220
1221 for (auto [Old, New] : Map)
1222 C->replaceUsesOfWith(Old, New);
1223 Map.insert(std::make_pair(I, C));
1224 }
1225 return Map;
1226}
1227
1228auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
1229 const ByteSpan &VSpan, int ScLen,
1230 Value *AlignVal, Value *AlignAddr) const
1231 -> void {
1232 LLVM_DEBUG(dbgs() << __func__ << "\n");
1233
1234 Type *SecTy = HVC.getByteTy(ScLen);
1235 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1236 bool DoAlign = !HVC.isZero(AlignVal);
1237 BasicBlock::iterator BasePos = Builder.GetInsertPoint();
1238 BasicBlock *BaseBlock = Builder.GetInsertBlock();
1239
1240 ByteSpan ASpan;
1241 auto *True = Constant::getAllOnesValue(HVC.getBoolTy(ScLen));
1242 auto *Undef = UndefValue::get(SecTy);
1243
1244 // Created load does not have to be "Instruction" (e.g. "undef").
1245 SmallVector<Value *> Loads(NumSectors + DoAlign, nullptr);
1246
1247 // We could create all of the aligned loads, and generate the valigns
1248 // at the location of the first load, but for large load groups, this
1249 // could create highly suboptimal code (there have been groups of 140+
1250 // loads in real code).
1251 // Instead, place the loads/valigns as close to the users as possible.
1252 // In any case we need to have a mapping from the blocks of VSpan (the
1253 // span covered by the pre-existing loads) to ASpan (the span covered
1254 // by the aligned loads). There is a small problem, though: ASpan needs
1255 // to have pointers to the loads/valigns, but we don't have these loads
1256 // because we don't know where to put them yet. We find out by creating
1257 // a section of ASpan that corresponds to values (blocks) from VSpan,
1258 // and checking where the new load should be placed. We need to attach
1259 // this location information to each block in ASpan somehow, so we put
1260 // distincts values for Seg.Val in each ASpan.Blocks[i], and use a map
1261 // to store the location for each Seg.Val.
1262 // The distinct values happen to be Blocks[i].Seg.Val = &Blocks[i],
1263 // which helps with printing ByteSpans without crashing when printing
1264 // Segments with these temporary identifiers in place of Val.
1265
1266 // Populate the blocks first, to avoid reallocations of the vector
1267 // interfering with generating the placeholder addresses.
1268 for (int Index = 0; Index != NumSectors; ++Index)
1269 ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
1270 for (int Index = 0; Index != NumSectors; ++Index) {
1271 ASpan.Blocks[Index].Seg.Val =
1272 reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
1273 }
1274
1275 // Multiple values from VSpan can map to the same value in ASpan. Since we
1276 // try to create loads lazily, we need to find the earliest use for each
1277 // value from ASpan.
1278 DenseMap<void *, Instruction *> EarliestUser;
1279 auto isEarlier = [](Instruction *A, Instruction *B) {
1280 if (B == nullptr)
1281 return true;
1282 if (A == nullptr)
1283 return false;
1284 assert(A->getParent() == B->getParent());
1285 return A->comesBefore(B);
1286 };
1287 auto earliestUser = [&](const auto &Uses) {
1288 Instruction *User = nullptr;
1289 for (const Use &U : Uses) {
1290 auto *I = dyn_cast<Instruction>(U.getUser());
1291 assert(I != nullptr && "Load used in a non-instruction?");
1292 // Make sure we only consider users in this block, but we need
1293 // to remember if there were users outside the block too. This is
1294 // because if no users are found, aligned loads will not be created.
1295 if (I->getParent() == BaseBlock) {
1296 if (!isa<PHINode>(I))
1297 User = std::min(User, I, isEarlier);
1298 } else {
1299 User = std::min(User, BaseBlock->getTerminator(), isEarlier);
1300 }
1301 }
1302 return User;
1303 };
1304
1305 for (const ByteSpan::Block &B : VSpan) {
1306 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
1307 for (const ByteSpan::Block &S : ASection) {
1308 auto &EU = EarliestUser[S.Seg.Val];
1309 EU = std::min(EU, earliestUser(B.Seg.Val->uses()), isEarlier);
1310 }
1311 }
1312
1313 LLVM_DEBUG({
1314 dbgs() << "ASpan:\n" << ASpan << '\n';
1315 dbgs() << "Earliest users of ASpan:\n";
1316 for (auto &[Val, User] : EarliestUser) {
1317 dbgs() << Val << "\n ->" << *User << '\n';
1318 }
1319 });
1320
1321 auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
1322 int Index, bool MakePred) {
1323 Value *Ptr =
1324 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1325 Value *Predicate =
1326 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1327
1328 // If vector shifting is potentially needed, accumulate metadata
1329 // from source sections of twice the load width.
1330 int Start = (Index - DoAlign) * ScLen;
1331 int Width = (1 + DoAlign) * ScLen;
1332 return this->createLoad(Builder, SecTy, Ptr, Predicate, ScLen, True, Undef,
1333 VSpan.section(Start, Width).values());
1334 };
1335
1336 auto moveBefore = [this](BasicBlock::iterator In, BasicBlock::iterator To) {
1337 // Move In and its upward dependencies to before To.
1338 assert(In->getParent() == To->getParent());
1339 DepList Deps = getUpwardDeps(&*In, &*To);
1340 In->moveBefore(To);
1341 // DepList is sorted with respect to positions in the basic block.
1342 InstMap Map = cloneBefore(In, Deps);
1343 for (auto [Old, New] : Map)
1344 In->replaceUsesOfWith(Old, New);
1345 };
1346
1347 // Generate necessary loads at appropriate locations.
1348 LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");
1349 for (int Index = 0; Index != NumSectors + 1; ++Index) {
1350 // In ASpan, each block will be either a single aligned load, or a
1351 // valign of a pair of loads. In the latter case, an aligned load j
1352 // will belong to the current valign, and the one in the previous
1353 // block (for j > 0).
1354 // Place the load at a location which will dominate the valign, assuming
1355 // the valign will be placed right before the earliest user.
1356 Instruction *PrevAt =
1357 DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
1358 Instruction *ThisAt =
1359 Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
1360 if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
1361 Builder.SetInsertPoint(Where);
1362 Loads[Index] =
1363 createLoad(Builder, VSpan, Index, DoAlign && Index == NumSectors);
1364 // We know it's safe to put the load at BasePos, but we'd prefer to put
1365 // it at "Where". To see if the load is safe to be placed at Where, put
1366 // it there first and then check if it's safe to move it to BasePos.
1367 // If not, then the load needs to be placed at BasePos.
1368 // We can't do this check proactively because we need the load to exist
1369 // in order to check legality.
1370 if (auto *Load = dyn_cast<Instruction>(Loads[Index])) {
1371 if (!HVC.isSafeToMoveBeforeInBB(*Load, BasePos))
1372 moveBefore(Load->getIterator(), BasePos);
1373 }
1374 LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');
1375 }
1376 }
1377
1378 // Generate valigns if needed, and fill in proper values in ASpan
1379 LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");
1380 for (int Index = 0; Index != NumSectors; ++Index) {
1381 ASpan[Index].Seg.Val = nullptr;
1382 if (auto *Where = EarliestUser[&ASpan[Index]]) {
1383 Builder.SetInsertPoint(Where);
1384 Value *Val = Loads[Index];
1385 assert(Val != nullptr);
1386 if (DoAlign) {
1387 Value *NextLoad = Loads[Index + 1];
1388 assert(NextLoad != nullptr);
1389 Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
1390 }
1391 ASpan[Index].Seg.Val = Val;
1392 LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');
1393 }
1394 }
1395
1396 for (const ByteSpan::Block &B : VSpan) {
1397 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
1398 Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
1399 Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));
1400
1401 // We're generating a reduction, where each instruction depends on
1402 // the previous one, so we need to order them according to the position
1403 // of their inputs in the code.
1404 std::vector<ByteSpan::Block *> ABlocks;
1405 for (ByteSpan::Block &S : ASection) {
1406 if (S.Seg.Val != nullptr)
1407 ABlocks.push_back(&S);
1408 }
1409 llvm::sort(ABlocks,
1410 [&](const ByteSpan::Block *A, const ByteSpan::Block *B) {
1411 return isEarlier(cast<Instruction>(A->Seg.Val),
1412 cast<Instruction>(B->Seg.Val));
1413 });
1414 for (ByteSpan::Block *S : ABlocks) {
1415 // The processing of the data loaded by the aligned loads
1416 // needs to be inserted after the data is available.
1417 Instruction *SegI = cast<Instruction>(S->Seg.Val);
1418 Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
1419 Value *Pay = HVC.vbytes(Builder, getPayload(S->Seg.Val));
1420 Accum =
1421 HVC.insertb(Builder, Accum, Pay, S->Seg.Start, S->Seg.Size, S->Pos);
1422 }
1423 // Instead of casting everything to bytes for the vselect, cast to the
1424 // original value type. This will avoid complications with casting masks.
1425 // For example, in cases when the original mask applied to i32, it could
1426 // be converted to a mask applicable to i8 via pred_typecast intrinsic,
1427 // but if the mask is not exactly of HVX length, extra handling would be
1428 // needed to make it work.
1429 Type *ValTy = getPayload(B.Seg.Val)->getType();
1430 Value *Cast = Builder.CreateBitCast(Accum, ValTy, "cst");
1431 Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
1432 getPassThrough(B.Seg.Val), "sel");
1433 B.Seg.Val->replaceAllUsesWith(Sel);
1434 }
1435}
1436
1437auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
1438 const ByteSpan &VSpan, int ScLen,
1439 Value *AlignVal, Value *AlignAddr) const
1440 -> void {
1441 LLVM_DEBUG(dbgs() << __func__ << "\n");
1442
1443 Type *SecTy = HVC.getByteTy(ScLen);
1444 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1445 bool DoAlign = !HVC.isZero(AlignVal);
1446
1447 // Stores.
1448 ByteSpan ASpanV, ASpanM;
1449
1450 // Return a vector value corresponding to the input value Val:
1451 // either <1 x Val> for scalar Val, or Val itself for vector Val.
1452 auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
1453 Type *Ty = Val->getType();
1454 if (Ty->isVectorTy())
1455 return Val;
1456 auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
1457 return Builder.CreateBitCast(Val, VecTy, "cst");
1458 };
1459
1460 // Create an extra "undef" sector at the beginning and at the end.
1461 // They will be used as the left/right filler in the vlalign step.
1462 for (int Index = (DoAlign ? -1 : 0); Index != NumSectors + DoAlign; ++Index) {
1463 // For stores, the size of each section is an aligned vector length.
1464 // Adjust the store offsets relative to the section start offset.
1465 ByteSpan VSection =
1466 VSpan.section(Index * ScLen, ScLen).shift(-Index * ScLen);
1467 Value *Undef = UndefValue::get(SecTy);
1469 Value *AccumV = Undef;
1470 Value *AccumM = Zero;
1471 for (ByteSpan::Block &S : VSection) {
1472 Value *Pay = getPayload(S.Seg.Val);
1473 Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
1474 Pay->getType(), HVC.getByteTy());
1475 Value *PartM = HVC.insertb(Builder, Zero, HVC.vbytes(Builder, Mask),
1476 S.Seg.Start, S.Seg.Size, S.Pos);
1477 AccumM = Builder.CreateOr(AccumM, PartM);
1478
1479 Value *PartV = HVC.insertb(Builder, Undef, HVC.vbytes(Builder, Pay),
1480 S.Seg.Start, S.Seg.Size, S.Pos);
1481
1482 AccumV = Builder.CreateSelect(
1483 Builder.CreateICmp(CmpInst::ICMP_NE, PartM, Zero), PartV, AccumV);
1484 }
1485 ASpanV.Blocks.emplace_back(AccumV, ScLen, Index * ScLen);
1486 ASpanM.Blocks.emplace_back(AccumM, ScLen, Index * ScLen);
1487 }
1488
1489 LLVM_DEBUG({
1490 dbgs() << "ASpanV before vlalign:\n" << ASpanV << '\n';
1491 dbgs() << "ASpanM before vlalign:\n" << ASpanM << '\n';
1492 });
1493
1494 // vlalign
1495 if (DoAlign) {
1496 for (int Index = 1; Index != NumSectors + 2; ++Index) {
1497 Value *PrevV = ASpanV[Index - 1].Seg.Val, *ThisV = ASpanV[Index].Seg.Val;
1498 Value *PrevM = ASpanM[Index - 1].Seg.Val, *ThisM = ASpanM[Index].Seg.Val;
1499 assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
1500 ASpanV[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
1501 ASpanM[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
1502 }
1503 }
1504
1505 LLVM_DEBUG({
1506 dbgs() << "ASpanV after vlalign:\n" << ASpanV << '\n';
1507 dbgs() << "ASpanM after vlalign:\n" << ASpanM << '\n';
1508 });
1509
1510 auto createStore = [&](IRBuilderBase &Builder, const ByteSpan &ASpanV,
1511 const ByteSpan &ASpanM, int Index, bool MakePred) {
1512 Value *Val = ASpanV[Index].Seg.Val;
1513 Value *Mask = ASpanM[Index].Seg.Val; // bytes
1514 if (HVC.isUndef(Val) || HVC.isZero(Mask))
1515 return;
1516 Value *Ptr =
1517 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1518 Value *Predicate =
1519 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1520
1521 // If vector shifting is potentially needed, accumulate metadata
1522 // from source sections of twice the store width.
1523 int Start = (Index - DoAlign) * ScLen;
1524 int Width = (1 + DoAlign) * ScLen;
1525 this->createStore(Builder, Val, Ptr, Predicate, ScLen,
1526 HVC.vlsb(Builder, Mask),
1527 VSpan.section(Start, Width).values());
1528 };
1529
1530 for (int Index = 0; Index != NumSectors + DoAlign; ++Index) {
1531 createStore(Builder, ASpanV, ASpanM, Index, DoAlign && Index == NumSectors);
1532 }
1533}
1534
1535auto AlignVectors::realignGroup(const MoveGroup &Move) -> bool {
1536 LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');
1537
1538 // TODO: Needs support for masked loads/stores of "scalar" vectors.
1539 if (!Move.IsHvx)
1540 return false;
1541
1542 // Return the element with the maximum alignment from Range,
1543 // where GetValue obtains the value to compare from an element.
1544 auto getMaxOf = [](auto Range, auto GetValue) {
1545 return *llvm::max_element(Range, [&GetValue](auto &A, auto &B) {
1546 return GetValue(A) < GetValue(B);
1547 });
1548 };
1549
1550 AddrList &BaseInfos = AddrGroups[Move.Base];
1551
1552 // Conceptually, there is a vector of N bytes covering the addresses
1553 // starting from the minimum offset (i.e. Base.Addr+Start). This vector
1554 // represents a contiguous memory region that spans all accessed memory
1555 // locations.
1556 // The correspondence between loaded or stored values will be expressed
1557 // in terms of this vector. For example, the 0th element of the vector
1558 // from the Base address info will start at byte Start from the beginning
1559 // of this conceptual vector.
1560 //
1561 // This vector will be loaded/stored starting at the nearest down-aligned
1562 // address and the amount of the down-alignment will be AlignVal:
1563 // valign(load_vector(align_down(Base+Start)), AlignVal)
1564
1565 std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
1566 AddrList MoveInfos;
1567
1569 BaseInfos, std::back_inserter(MoveInfos),
1570 [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });
1571
1572 // Maximum alignment present in the whole address group.
1573 const AddrInfo &WithMaxAlign =
1574 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
1575 Align MaxGiven = WithMaxAlign.HaveAlign;
1576
1577 // Minimum alignment present in the move address group.
1578 const AddrInfo &WithMinOffset =
1579 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
1580
1581 const AddrInfo &WithMaxNeeded =
1582 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
1583 Align MinNeeded = WithMaxNeeded.NeedAlign;
1584
1585 // Set the builder's insertion point right before the load group, or
1586 // immediately after the store group. (Instructions in a store group are
1587 // listed in reverse order.)
1588 Instruction *InsertAt = Move.Main.front();
1589 if (!Move.IsLoad) {
1590 // There should be a terminator (which store isn't, but check anyways).
1591 assert(InsertAt->getIterator() != InsertAt->getParent()->end());
1592 InsertAt = &*std::next(InsertAt->getIterator());
1593 }
1594
1595 IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
1596 InstSimplifyFolder(HVC.DL));
1597 Value *AlignAddr = nullptr; // Actual aligned address.
1598 Value *AlignVal = nullptr; // Right-shift amount (for valign).
1599
1600 if (MinNeeded <= MaxGiven) {
1601 int Start = WithMinOffset.Offset;
1602 int OffAtMax = WithMaxAlign.Offset;
1603 // Shift the offset of the maximally aligned instruction (OffAtMax)
1604 // back by just enough multiples of the required alignment to cover the
1605 // distance from Start to OffAtMax.
1606 // Calculate the address adjustment amount based on the address with the
1607 // maximum alignment. This is to allow a simple gep instruction instead
1608 // of potential bitcasts to i8*.
1609 int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());
1610 AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,
1611 WithMaxAlign.ValTy, Adjust, Move.Clones);
1612 int Diff = Start - (OffAtMax + Adjust);
1613 AlignVal = HVC.getConstInt(Diff);
1614 assert(Diff >= 0);
1615 assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
1616 } else {
1617 // WithMinOffset is the lowest address in the group,
1618 // WithMinOffset.Addr = Base+Start.
1619 // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
1620 // mask off unnecessary bits, so it's ok to just the original pointer as
1621 // the alignment amount.
1622 // Do an explicit down-alignment of the address to avoid creating an
1623 // aligned instruction with an address that is not really aligned.
1624 AlignAddr =
1625 createAlignedPointer(Builder, WithMinOffset.Addr, WithMinOffset.ValTy,
1626 MinNeeded.value(), Move.Clones);
1627 AlignVal =
1628 Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy(), "pti");
1629 if (auto *I = dyn_cast<Instruction>(AlignVal)) {
1630 for (auto [Old, New] : Move.Clones)
1631 I->replaceUsesOfWith(Old, New);
1632 }
1633 }
1634
1635 ByteSpan VSpan;
1636 for (const AddrInfo &AI : MoveInfos) {
1637 VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),
1638 AI.Offset - WithMinOffset.Offset);
1639 }
1640
1641 // The aligned loads/stores will use blocks that are either scalars,
1642 // or HVX vectors. Let "sector" be the unified term for such a block.
1643 // blend(scalar, vector) -> sector...
1644 int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
1645 : std::max<int>(MinNeeded.value(), 4);
1646 assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
1647 assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
1648
1649 LLVM_DEBUG({
1650 dbgs() << "ScLen: " << ScLen << "\n";
1651 dbgs() << "AlignVal:" << *AlignVal << "\n";
1652 dbgs() << "AlignAddr:" << *AlignAddr << "\n";
1653 dbgs() << "VSpan:\n" << VSpan << '\n';
1654 });
1655
1656 if (Move.IsLoad)
1657 realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1658 else
1659 realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1660
1661 Instruction *Front = Move.Main.front();
1662 HVC.ORE.emit([&]() {
1663 return OptimizationRemark(DEBUG_TYPE, "VectorsAligned",
1664 Front->getDebugLoc(), Front->getParent())
1665 << "aligned vector memory operations";
1666 });
1667
1668 for (auto *Inst : Move.Main)
1669 Inst->eraseFromParent();
1670
1671 return true;
1672}
1673
1674auto AlignVectors::makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
1675 int Alignment) const -> Value * {
1676 auto *AlignTy = AlignVal->getType();
1677 Value *And = Builder.CreateAnd(
1678 AlignVal, ConstantInt::get(AlignTy, Alignment - 1), "and");
1679 Value *Zero = ConstantInt::get(AlignTy, 0);
1680 return Builder.CreateICmpNE(And, Zero, "isz");
1681}
1682
1683auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
1684 if (!HVC.isByteVecTy(Ty))
1685 return false;
1686 int Size = HVC.getSizeOf(Ty);
1687 if (HVC.HST.isTypeForHVX(Ty))
1688 return Size == static_cast<int>(HVC.HST.getVectorLength());
1689 return Size == 4 || Size == 8;
1690}
1691
1692auto AlignVectors::run() -> bool {
1693 LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
1694 << '\n');
1695 if (!createAddressGroups())
1696 return false;
1697
1698 LLVM_DEBUG({
1699 dbgs() << "Address groups(" << AddrGroups.size() << "):\n";
1700 for (auto &[In, AL] : AddrGroups) {
1701 for (const AddrInfo &AI : AL)
1702 dbgs() << "---\n" << AI << '\n';
1703 }
1704 });
1705
1706 bool Changed = false;
1707 MoveList LoadGroups, StoreGroups;
1708
1709 for (auto &G : AddrGroups) {
1710 llvm::append_range(LoadGroups, createLoadGroups(G.second));
1711 llvm::append_range(StoreGroups, createStoreGroups(G.second));
1712 }
1713
1714 LLVM_DEBUG({
1715 dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";
1716 for (const MoveGroup &G : LoadGroups)
1717 dbgs() << G << "\n";
1718 dbgs() << "Store groups(" << StoreGroups.size() << "):\n";
1719 for (const MoveGroup &G : StoreGroups)
1720 dbgs() << G << "\n";
1721 });
1722
1723 // Cumulative limit on the number of groups.
1724 unsigned CountLimit = VAGroupCountLimit;
1725 if (CountLimit == 0)
1726 return false;
1727
1728 if (LoadGroups.size() > CountLimit) {
1729 LoadGroups.resize(CountLimit);
1730 StoreGroups.clear();
1731 } else {
1732 unsigned StoreLimit = CountLimit - LoadGroups.size();
1733 if (StoreGroups.size() > StoreLimit)
1734 StoreGroups.resize(StoreLimit);
1735 }
1736
1737 for (auto &M : LoadGroups)
1738 Changed |= moveTogether(M);
1739 for (auto &M : StoreGroups)
1740 Changed |= moveTogether(M);
1741
1742 LLVM_DEBUG(dbgs() << "After moveTogether:\n" << HVC.F);
1743
1744 for (auto &M : LoadGroups)
1745 Changed |= realignGroup(M);
1746 for (auto &M : StoreGroups)
1747 Changed |= realignGroup(M);
1748
1749 return Changed;
1750}
1751
1752// --- End AlignVectors
1753
1754// --- Begin HvxIdioms
1755
1756auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
1757 -> std::pair<unsigned, Signedness> {
1758 unsigned Bits = HVC.getNumSignificantBits(V, In);
1759 // The significant bits are calculated including the sign bit. This may
1760 // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
1761 // result in 33 significant bits. To avoid extra words, skip the extra
1762 // sign bit, but keep information that the value is to be treated as
1763 // unsigned.
1764 KnownBits Known = HVC.getKnownBits(V, In);
1765 Signedness Sign = Signed;
1766 unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
1767 if (isPowerOf2_32(Bits))
1768 NumToTest = Bits;
1769 else if (Bits > 1 && isPowerOf2_32(Bits - 1))
1770 NumToTest = Bits - 1;
1771
1772 if (NumToTest != 0 && Known.Zero.ashr(NumToTest).isAllOnes()) {
1773 Sign = Unsigned;
1774 Bits = NumToTest;
1775 }
1776
1777 // If the top bit of the nearest power-of-2 is zero, this value is
1778 // positive. It could be treated as either signed or unsigned.
1779 if (unsigned Pow2 = PowerOf2Ceil(Bits); Pow2 != Bits) {
1780 if (Known.Zero.ashr(Pow2 - 1).isAllOnes())
1781 Sign = Positive;
1782 }
1783 return {Bits, Sign};
1784}
1785
1786auto HvxIdioms::canonSgn(SValue X, SValue Y) const
1787 -> std::pair<SValue, SValue> {
1788 // Canonicalize the signedness of X and Y, so that the result is one of:
1789 // S, S
1790 // U/P, S
1791 // U/P, U/P
1792 if (X.Sgn == Signed && Y.Sgn != Signed)
1793 std::swap(X, Y);
1794 return {X, Y};
1795}
1796
1797// Match
1798// (X * Y) [>> N], or
1799// ((X * Y) + (1 << M)) >> N
1800auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
1801 using namespace PatternMatch;
1802 auto *Ty = In.getType();
1803
1804 if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
1805 return std::nullopt;
1806
1807 unsigned Width = cast<IntegerType>(Ty->getScalarType())->getBitWidth();
1808
1809 FxpOp Op;
1810 Value *Exp = &In;
1811
1812 // Fixed-point multiplication is always shifted right (except when the
1813 // fraction is 0 bits).
1814 auto m_Shr = [](auto &&V, auto &&S) {
1815 return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
1816 };
1817
1818 uint64_t Qn = 0;
1819 if (Value *T; match(Exp, m_Shr(m_Value(T), m_ConstantInt(Qn)))) {
1820 Op.Frac = Qn;
1821 Exp = T;
1822 } else {
1823 Op.Frac = 0;
1824 }
1825
1826 if (Op.Frac > Width)
1827 return std::nullopt;
1828
1829 // Check if there is rounding added.
1830 uint64_t CV;
1831 if (Value *T;
1832 Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_ConstantInt(CV)))) {
1833 if (CV != 0 && !isPowerOf2_64(CV))
1834 return std::nullopt;
1835 if (CV != 0)
1836 Op.RoundAt = Log2_64(CV);
1837 Exp = T;
1838 }
1839
1840 // Check if the rest is a multiplication.
1841 if (match(Exp, m_Mul(m_Value(Op.X.Val), m_Value(Op.Y.Val)))) {
1842 Op.Opcode = Instruction::Mul;
1843 // FIXME: The information below is recomputed.
1844 Op.X.Sgn = getNumSignificantBits(Op.X.Val, &In).second;
1845 Op.Y.Sgn = getNumSignificantBits(Op.Y.Val, &In).second;
1846 Op.ResTy = cast<VectorType>(Ty);
1847 return Op;
1848 }
1849
1850 return std::nullopt;
1851}
1852
1853auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
1854 -> Value * {
1855 assert(Op.X.Val->getType() == Op.Y.Val->getType());
1856
1857 auto *VecTy = dyn_cast<VectorType>(Op.X.Val->getType());
1858 if (VecTy == nullptr)
1859 return nullptr;
1860 auto *ElemTy = cast<IntegerType>(VecTy->getElementType());
1861 unsigned ElemWidth = ElemTy->getBitWidth();
1862
1863 // TODO: This can be relaxed after legalization is done pre-isel.
1864 if ((HVC.length(VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
1865 return nullptr;
1866
1867 // There are no special intrinsics that should be used for multiplying
1868 // signed 8-bit values, so just skip them. Normal codegen should handle
1869 // this just fine.
1870 if (ElemWidth <= 8)
1871 return nullptr;
1872 // Similarly, if this is just a multiplication that can be handled without
1873 // intervention, then leave it alone.
1874 if (ElemWidth <= 32 && Op.Frac == 0)
1875 return nullptr;
1876
1877 auto [BitsX, SignX] = getNumSignificantBits(Op.X.Val, &In);
1878 auto [BitsY, SignY] = getNumSignificantBits(Op.Y.Val, &In);
1879
1880 // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).
1881
1882 Value *X = Op.X.Val, *Y = Op.Y.Val;
1883 IRBuilder Builder(In.getParent(), In.getIterator(),
1884 InstSimplifyFolder(HVC.DL));
1885
1886 auto roundUpWidth = [](unsigned Width) -> unsigned {
1887 if (Width <= 32 && !isPowerOf2_32(Width)) {
1888 // If the element width is not a power of 2, round it up
1889 // to the next one. Do this for widths not exceeding 32.
1890 return PowerOf2Ceil(Width);
1891 }
1892 if (Width > 32 && Width % 32 != 0) {
1893 // For wider elements, round it up to the multiple of 32.
1894 return alignTo(Width, 32u);
1895 }
1896 return Width;
1897 };
1898
1899 BitsX = roundUpWidth(BitsX);
1900 BitsY = roundUpWidth(BitsY);
1901
1902 // For elementwise multiplication vectors must have the same lengths, so
1903 // resize the elements of both inputs to the same width, the max of the
1904 // calculated significant bits.
1905 unsigned Width = std::max(BitsX, BitsY);
1906
1907 auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);
1908 if (Width < ElemWidth) {
1909 X = Builder.CreateTrunc(X, ResizeTy, "trn");
1910 Y = Builder.CreateTrunc(Y, ResizeTy, "trn");
1911 } else if (Width > ElemWidth) {
1912 X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy, "sxt")
1913 : Builder.CreateZExt(X, ResizeTy, "zxt");
1914 Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy, "sxt")
1915 : Builder.CreateZExt(Y, ResizeTy, "zxt");
1916 };
1917
1918 assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1919
1920 unsigned VecLen = HVC.length(ResizeTy);
1921 unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(Width, 32u);
1922
1924 FxpOp ChopOp = Op;
1925 ChopOp.ResTy = VectorType::get(Op.ResTy->getElementType(), ChopLen, false);
1926
1927 for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1928 ChopOp.X.Val = HVC.subvector(Builder, X, V * ChopLen, ChopLen);
1929 ChopOp.Y.Val = HVC.subvector(Builder, Y, V * ChopLen, ChopLen);
1930 Results.push_back(processFxpMulChopped(Builder, In, ChopOp));
1931 if (Results.back() == nullptr)
1932 break;
1933 }
1934
1935 if (Results.empty() || Results.back() == nullptr)
1936 return nullptr;
1937
1938 Value *Cat = HVC.concat(Builder, Results);
1939 Value *Ext = SignX == Signed || SignY == Signed
1940 ? Builder.CreateSExt(Cat, VecTy, "sxt")
1941 : Builder.CreateZExt(Cat, VecTy, "zxt");
1942 return Ext;
1943}
1944
1945inline bool HvxIdioms::matchScatter(Instruction &In) const {
1946 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1947 if (!II)
1948 return false;
1949 return (II->getIntrinsicID() == Intrinsic::masked_scatter);
1950}
1951
1952inline bool HvxIdioms::matchGather(Instruction &In) const {
1953 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1954 if (!II)
1955 return false;
1956 return (II->getIntrinsicID() == Intrinsic::masked_gather);
1957}
1958
1959inline bool HvxIdioms::matchMLoad(Instruction &In) const {
1960 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1961 if (!II)
1962 return false;
1963 return (II->getIntrinsicID() == Intrinsic::masked_load);
1964}
1965
1966inline bool HvxIdioms::matchMStore(Instruction &In) const {
1967 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1968 if (!II)
1969 return false;
1970 return (II->getIntrinsicID() == Intrinsic::masked_store);
1971}
1972
1973Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
1974
1975// Binary instructions we want to handle as users of gather/scatter.
1976inline bool isArithmetic(unsigned Opc) {
1977 switch (Opc) {
1978 case Instruction::Add:
1979 case Instruction::Sub:
1980 case Instruction::Mul:
1981 case Instruction::And:
1982 case Instruction::Or:
1983 case Instruction::Xor:
1984 case Instruction::AShr:
1985 case Instruction::LShr:
1986 case Instruction::Shl:
1987 case Instruction::UDiv:
1988 return true;
1989 }
1990 return false;
1991}
1992
1993// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
1994inline Value *getPointer(Value *Ptr) {
1995 assert(Ptr && "Unable to extract pointer");
1996 if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr))
1997 return Ptr;
1998 if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr))
1999 return getLoadStorePointerOperand(Ptr);
2001 if (II->getIntrinsicID() == Intrinsic::masked_store)
2002 return II->getOperand(1);
2003 }
2004 return nullptr;
2005}
2006
2008 HvxIdioms::DstQualifier &Qual) {
2009 Instruction *Destination = nullptr;
2010 if (!In)
2011 return Destination;
2012 if (isa<StoreInst>(In)) {
2013 Destination = In;
2014 Qual = HvxIdioms::LdSt;
2015 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
2016 if (II->getIntrinsicID() == Intrinsic::masked_gather) {
2017 Destination = In;
2018 Qual = HvxIdioms::LLVM_Gather;
2019 } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
2020 Destination = In;
2021 Qual = HvxIdioms::LLVM_Scatter;
2022 } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
2023 Destination = In;
2024 Qual = HvxIdioms::LdSt;
2025 } else if (II->getIntrinsicID() ==
2026 Intrinsic::hexagon_V6_vgather_vscattermh) {
2027 Destination = In;
2028 Qual = HvxIdioms::HEX_Gather_Scatter;
2029 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
2030 Destination = In;
2031 Qual = HvxIdioms::HEX_Scatter;
2032 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
2033 Destination = In;
2034 Qual = HvxIdioms::HEX_Gather;
2035 }
2036 } else if (isa<ZExtInst>(In)) {
2037 return locateDestination(In, Qual);
2038 } else if (isa<CastInst>(In)) {
2039 return locateDestination(In, Qual);
2040 } else if (isa<CallInst>(In)) {
2041 Destination = In;
2042 Qual = HvxIdioms::Call;
2043 } else if (isa<GetElementPtrInst>(In)) {
2044 return locateDestination(In, Qual);
2045 } else if (isArithmetic(In->getOpcode())) {
2046 Destination = In;
2047 Qual = HvxIdioms::Arithmetic;
2048 } else {
2049 LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
2050 }
2051 return Destination;
2052}
2053
2054// This method attempts to find destination (user) for a given intrinsic.
2055// Given that these are produced only by Ripple, the number of options is
2056// limited. Simplest case is explicit store which in fact is redundant (since
2057// HVX gater creates its own store during packetization). Nevertheless we need
2058// to figure address where we storing. Other cases are more complicated, but
2059// still few.
2060Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
2061 Instruction *Destination = nullptr;
2062 if (!In)
2063 return Destination;
2064 // Get all possible destinations
2066 // Iterate over the uses of the instruction
2067 for (auto &U : In->uses()) {
2068 if (auto *UI = dyn_cast<Instruction>(U.getUser())) {
2069 Destination = selectDestination(UI, Qual);
2070 if (Destination)
2071 Users.push_back(Destination);
2072 }
2073 }
2074 // Now see which of the users (if any) is a memory destination.
2075 for (auto *I : Users)
2076 if (getPointer(I))
2077 return I;
2078 return Destination;
2079}
2080
2081// The two intrinsics we handle here have GEP in a different position.
2083 assert(In && "Bad instruction");
2085 assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
2086 IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
2087 "Not a gather Intrinsic");
2088 GetElementPtrInst *GEPIndex = nullptr;
2089 if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
2090 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0));
2091 else
2092 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1));
2093 return GEPIndex;
2094}
2095
2096// Given the intrinsic find its GEP argument and extract base address it uses.
2097// The method relies on the way how Ripple typically forms the GEP for
2098// scatter/gather.
2101 if (!GEPIndex) {
2102 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2103 return nullptr;
2104 }
2105 Value *BaseAddress = GEPIndex->getPointerOperand();
2106 auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress);
2107 if (IndexLoad)
2108 return IndexLoad;
2109
2110 auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress);
2111 if (IndexZEx) {
2112 IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0));
2113 if (IndexLoad)
2114 return IndexLoad;
2115 IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0));
2116 if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
2118 }
2119 auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress);
2120 if (BaseShuffle) {
2121 IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0));
2122 if (IndexLoad)
2123 return IndexLoad;
2124 auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0));
2125 if (IE) {
2126 auto *Src = IE->getOperand(1);
2127 IndexLoad = dyn_cast<LoadInst>(Src);
2128 if (IndexLoad)
2129 return IndexLoad;
2130 auto *Alloca = dyn_cast<AllocaInst>(Src);
2131 if (Alloca)
2132 return Alloca;
2133 if (isa<Argument>(Src)) {
2134 return Src;
2135 }
2136 if (isa<GlobalValue>(Src)) {
2137 return Src;
2138 }
2139 }
2140 }
2141 LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n");
2142 return nullptr;
2143}
2144
2146 if (!In)
2147 return nullptr;
2148
2149 if (isa<LoadInst>(In) || isa<StoreInst>(In))
2150 return getLoadStoreType(In);
2151
2153 if (II->getIntrinsicID() == Intrinsic::masked_load)
2154 return II->getType();
2155 if (II->getIntrinsicID() == Intrinsic::masked_store)
2156 return II->getOperand(0)->getType();
2157 }
2158 return In->getType();
2159}
2160
2162 if (!In)
2163 return nullptr;
2164 if (isa<LoadInst>(In))
2165 return In;
2167 if (II->getIntrinsicID() == Intrinsic::masked_load)
2168 return In;
2169 if (II->getIntrinsicID() == Intrinsic::masked_gather)
2170 return In;
2171 }
2172 if (auto *IndexZEx = dyn_cast<ZExtInst>(In))
2173 return locateIndexesFromGEP(IndexZEx->getOperand(0));
2174 if (auto *IndexSEx = dyn_cast<SExtInst>(In))
2175 return locateIndexesFromGEP(IndexSEx->getOperand(0));
2176 if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))
2177 return locateIndexesFromGEP(BaseShuffle->getOperand(0));
2178 if (auto *IE = dyn_cast<InsertElementInst>(In))
2179 return locateIndexesFromGEP(IE->getOperand(1));
2180 if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In))
2181 return cstDataVector;
2182 if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))
2183 return GEPIndex->getOperand(0);
2184 return nullptr;
2185}
2186
2187// Given the intrinsic find its GEP argument and extract offsetts from the base
2188// address it uses.
2191 if (!GEPIndex) {
2192 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2193 return nullptr;
2194 }
2195 Value *Indexes = GEPIndex->getOperand(1);
2196 if (auto *IndexLoad = locateIndexesFromGEP(Indexes))
2197 return IndexLoad;
2198
2199 LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n");
2200 return nullptr;
2201}
2202
2203// Because of aukward definition of many Hex intrinsics we often have to
2204// reinterprete HVX native <64 x i16> as <32 x i32> which in practice is a NOP
2205// for all use cases, so this only exist to make IR builder happy.
2206inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
2207 IRBuilderBase &Builder,
2208 LLVMContext &Ctx, Value *I) {
2209 assert(I && "Unable to reinterprete cast");
2210 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2211 std::vector<unsigned> shuffleMask;
2212 for (unsigned i = 0; i < 64; ++i)
2213 shuffleMask.push_back(i);
2214 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2215 Value *CastShuffle =
2216 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2217 return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32");
2218}
2219
2220// Recast <128 x i8> as <32 x i32>
2221inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
2222 IRBuilderBase &Builder,
2223 LLVMContext &Ctx, Value *I) {
2224 assert(I && "Unable to reinterprete cast");
2225 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2226 std::vector<unsigned> shuffleMask;
2227 for (unsigned i = 0; i < 128; ++i)
2228 shuffleMask.push_back(i);
2229 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2230 Value *CastShuffle =
2231 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2232 return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32");
2233}
2234
2235// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern
2236inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
2237 IRBuilderBase &Builder, LLVMContext &Ctx,
2238 unsigned int pattern) {
2239 std::vector<unsigned int> byteMask;
2240 for (unsigned i = 0; i < 32; ++i)
2241 byteMask.push_back(pattern);
2242
2243 return Builder.CreateIntrinsic(
2244 HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt),
2245 {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)},
2246 nullptr);
2247}
2248
2249Value *HvxIdioms::processVScatter(Instruction &In) const {
2250 auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());
2251 assert(InpTy && "Cannot handle no vector type for llvm.scatter/gather");
2252 unsigned InpSize = HVC.getSizeOf(InpTy);
2253 auto *F = In.getFunction();
2254 LLVMContext &Ctx = F->getContext();
2255 auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType());
2256 assert(ElemTy && "llvm.scatter needs integer type argument");
2257 unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy);
2258 LLVM_DEBUG({
2259 unsigned Elements = HVC.length(InpTy);
2260 dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";
2261 dbgs() << " Input type(" << *InpTy << ") elements(" << Elements
2262 << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("
2263 << ElemWidth << ")\n";
2264 });
2265
2266 IRBuilder Builder(In.getParent(), In.getIterator(),
2267 InstSimplifyFolder(HVC.DL));
2268
2269 auto *ValueToScatter = In.getOperand(0);
2270 LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n");
2271
2272 if (HVC.HST.getVectorLength() != InpSize) {
2273 LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
2274 << ") for vscatter\n");
2275 return nullptr;
2276 }
2277
2278 // Base address of indexes.
2279 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2280 if (!IndexLoad)
2281 return nullptr;
2282 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2283
2284 // Address of destination. Must be in VTCM.
2285 auto *Ptr = getPointer(IndexLoad);
2286 if (!Ptr)
2287 return nullptr;
2288 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2289 // Indexes/offsets
2290 auto *Indexes = locateIndexesFromIntrinsic(&In);
2291 if (!Indexes)
2292 return nullptr;
2293 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2294 Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx),
2295 "cst_ptr_to_i32");
2296 LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n");
2297 // Adjust Indexes
2298 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2299 Value *CastIndex = nullptr;
2300 if (cstDataVector) {
2301 // Our indexes are represented as a constant. We need it in a reg.
2302 Type *IndexVectorType = HVC.getHvxTy(HVC.getIntTy(32), false);
2303 AllocaInst *IndexesAlloca = Builder.CreateAlloca(IndexVectorType);
2304 [[maybe_unused]] auto *StoreIndexes =
2305 Builder.CreateStore(cstDataVector, IndexesAlloca);
2306 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2307 CastIndex =
2308 Builder.CreateLoad(IndexVectorType, IndexesAlloca, "reload_index");
2309 } else {
2310 if (ElemWidth == 2)
2311 CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2312 else
2313 CastIndex = Indexes;
2314 }
2315 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2316
2317 if (ElemWidth == 1) {
2318 // v128i8 There is no native instruction for this.
2319 // Do this as two Hi/Lo gathers with masking.
2320 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2321 // Extend indexes. We assume that indexes are in 128i8 format - need to
2322 // expand them to Hi/Lo 64i16
2323 Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32");
2324 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2325 auto *UnpackedIndexes = Builder.CreateIntrinsic(
2326 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr);
2327 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n");
2328
2329 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2330 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2331 [[maybe_unused]] Value *IndexHi =
2332 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2333 [[maybe_unused]] Value *IndexLo =
2334 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2335 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2336 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2337 // Now unpack values to scatter
2338 Value *CastSrc =
2339 getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter);
2340 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2341 auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
2342 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr);
2343 LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter
2344 << ")\n");
2345
2346 [[maybe_unused]] Value *UVSHi =
2347 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter);
2348 [[maybe_unused]] Value *UVSLo =
2349 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter);
2350 LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n");
2351 LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n");
2352
2353 // Create the mask for individual bytes
2354 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2355 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2356 [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(
2357 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2358 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2359 IndexHi, UVSHi},
2360 nullptr);
2361 LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n");
2362 return Builder.CreateIntrinsic(
2363 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2364 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2365 IndexLo, UVSLo},
2366 nullptr);
2367 } else if (ElemWidth == 2) {
2368 Value *CastSrc =
2369 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter);
2370 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2371 return Builder.CreateIntrinsic(
2372 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B,
2373 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2374 CastSrc},
2375 nullptr);
2376 } else if (ElemWidth == 4) {
2377 return Builder.CreateIntrinsic(
2378 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B,
2379 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2380 ValueToScatter},
2381 nullptr);
2382 } else {
2383 LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
2384 return nullptr;
2385 }
2386}
2387
2388Value *HvxIdioms::processVGather(Instruction &In) const {
2389 [[maybe_unused]] auto *InpTy =
2390 dyn_cast<VectorType>(In.getOperand(0)->getType());
2391 assert(InpTy && "Cannot handle no vector type for llvm.gather");
2392 [[maybe_unused]] auto *ElemTy =
2393 dyn_cast<PointerType>(InpTy->getElementType());
2394 assert(ElemTy && "llvm.gather needs vector of ptr argument");
2395 auto *F = In.getFunction();
2396 LLVMContext &Ctx = F->getContext();
2397 LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
2398 << *In.getParent() << "\n");
2399 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2400 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2401 << ") type(" << *ElemTy << ") Access alignment("
2402 << *In.getOperand(1) << ") AddressSpace("
2403 << ElemTy->getAddressSpace() << ")\n");
2404
2405 // TODO: Handle masking of elements.
2406 assert(dyn_cast<VectorType>(In.getOperand(2)->getType()) &&
2407 "llvm.gather needs vector for mask");
2408 IRBuilder Builder(In.getParent(), In.getIterator(),
2409 InstSimplifyFolder(HVC.DL));
2410
2411 // See who is using the result. The difference between LLVM and HVX vgather
2412 // Intrinsic makes it impossible to handle all cases with temp storage. Alloca
2413 // in VTCM is not yet supported, so for now we just bail out for those cases.
2414 HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
2415 Instruction *Dst = locateDestination(&In, Qual);
2416 if (!Dst) {
2417 LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n");
2418 return nullptr;
2419 }
2420 LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual
2421 << ")\n");
2422
2423 // Address of destination. Must be in VTCM.
2424 auto *Ptr = getPointer(Dst);
2425 if (!Ptr) {
2426 LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
2427 return nullptr;
2428 }
2429
2430 // Result type. Assume it is a vector type.
2431 auto *DstType = cast<VectorType>(getIndexType(Dst));
2432 assert(DstType && "Cannot handle non vector dst type for llvm.gather");
2433
2434 // Base address for sources to be loaded
2435 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2436 if (!IndexLoad)
2437 return nullptr;
2438 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2439
2440 // Gather indexes/offsets
2441 auto *Indexes = locateIndexesFromIntrinsic(&In);
2442 if (!Indexes)
2443 return nullptr;
2444 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2445
2446 Instruction *Gather = nullptr;
2447 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2448 if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
2449 // We fully assume the address space is in VTCM. We also assume that all
2450 // pointers in Operand(0) have the same base(!).
2451 // This is the most basic case of all the above.
2452 unsigned OutputSize = HVC.getSizeOf(DstType);
2453 auto *DstElemTy = cast<IntegerType>(DstType->getElementType());
2454 unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy);
2455 LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType()
2456 << " Address space ("
2457 << Ptr->getType()->getPointerAddressSpace() << ")\n"
2458 << " Result type : " << *DstType
2459 << "\n Size in bytes : " << OutputSize
2460 << " element type(" << *DstElemTy
2461 << ")\n ElemWidth : " << ElemWidth << " bytes\n");
2462
2463 auto *IndexType = cast<VectorType>(getIndexType(Indexes));
2464 assert(IndexType && "Cannot handle non vector index type for llvm.gather");
2465 unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType());
2466 LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n");
2467
2468 // Intrinsic takes i32 instead of pointer so cast.
2469 Value *CastedPtr = Builder.CreateBitOrPointerCast(
2470 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2471 // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
2472 // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty]
2473 // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty]
2474 // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty]
2475 // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
2476 // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty]
2477 // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty]
2478 if (HVC.HST.getVectorLength() == OutputSize) {
2479 if (ElemWidth == 1) {
2480 // v128i8 There is no native instruction for this.
2481 // Do this as two Hi/Lo gathers with masking.
2482 // Unpack indexes. We assume that indexes are in 128i8 format - need to
2483 // expand them to Hi/Lo 64i16
2484 Value *CastIndexes =
2485 Builder.CreateBitCast(Indexes, NT, "cast_to_32i32");
2486 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2487 auto *UnpackedIndexes =
2488 Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true),
2489 V6_vunpack, CastIndexes, nullptr);
2490 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes
2491 << ")\n");
2492
2493 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2494 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2495 [[maybe_unused]] Value *IndexHi =
2496 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2497 [[maybe_unused]] Value *IndexLo =
2498 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2499 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2500 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2501 // Create the mask for individual bytes
2502 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2503 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2504 // We use our destination allocation as a temp storage
2505 // This is unlikely to work properly for masked gather.
2506 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq);
2507 [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(
2508 Type::getVoidTy(Ctx), V6_vgather,
2509 {Ptr, QByteMask, CastedPtr,
2510 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
2511 nullptr);
2512 LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n");
2513 // Rematerialize the result
2514 [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(
2515 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi");
2516 LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n");
2517 // Same for the low part. Here we use Gather to return non-NULL result
2518 // from this function and continue to iterate. We also are deleting Dst
2519 // store below.
2520 Gather = Builder.CreateIntrinsic(
2521 Type::getVoidTy(Ctx), V6_vgather,
2522 {Ptr, QByteMask, CastedPtr,
2523 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
2524 nullptr);
2525 LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n");
2526 Value *LoadedResultLo = Builder.CreateLoad(
2527 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo");
2528 LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n");
2529 // Now we have properly sized bytes in every other position
2530 // B b A a c a A b B c f F g G h H is presented as
2531 // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
2532 // Use vpack to gather them
2533 auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb);
2534 [[maybe_unused]] auto Res = Builder.CreateIntrinsic(
2535 NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr);
2536 LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n");
2537 [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr);
2538 LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n");
2539 } else if (ElemWidth == 2) {
2540 // v32i16
2541 if (IndexWidth == 2) {
2542 // Reinterprete 64i16 as 32i32. Only needed for syntactic IR match.
2543 Value *CastIndex =
2544 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2545 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2546 // shift all i16 left by 1 to match short addressing mode instead of
2547 // byte.
2548 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2549 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2550 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2552 << " Shifted half index: " << *AdjustedIndex << ")\n");
2553
2554 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh);
2555 // The 3rd argument is the size of the region to gather from. Probably
2556 // want to set it to max VTCM size.
2557 Gather = Builder.CreateIntrinsic(
2558 Type::getVoidTy(Ctx), V6_vgather,
2559 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2560 AdjustedIndex},
2561 nullptr);
2562 for (auto &U : Dst->uses()) {
2563 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2564 dbgs() << " dst used by: " << *UI << "\n";
2565 }
2566 for (auto &U : In.uses()) {
2567 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2568 dbgs() << " In used by : " << *UI << "\n";
2569 }
2570 // Create temp load from result in case the result is used by any
2571 // other instruction.
2572 Value *LoadedResult = Builder.CreateLoad(
2573 HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result");
2574 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2575 In.replaceAllUsesWith(LoadedResult);
2576 } else {
2577 dbgs() << " Unhandled index type for vgather\n";
2578 return nullptr;
2579 }
2580 } else if (ElemWidth == 4) {
2581 if (IndexWidth == 4) {
2582 // v32i32
2583 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2584 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2585 Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)});
2587 << " Shifted word index: " << *AdjustedIndex << ")\n");
2588 Gather = Builder.CreateIntrinsic(
2589 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B,
2590 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2591 AdjustedIndex},
2592 nullptr);
2593 } else {
2594 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2595 return nullptr;
2596 }
2597 } else {
2598 LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n");
2599 return nullptr;
2600 }
2601 } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
2602 // This is half of the reg width, duplicate low in high
2603 LLVM_DEBUG(dbgs() << " Unhandled half of register size\n");
2604 return nullptr;
2605 } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
2606 LLVM_DEBUG(dbgs() << " Unhandle twice the register size\n");
2607 return nullptr;
2608 }
2609 // Erase the original intrinsic and store that consumes it.
2610 // HVX will create a pseudo for gather that is expanded to gather + store
2611 // during packetization.
2612 Dst->eraseFromParent();
2613 } else if (Qual == HvxIdioms::LLVM_Scatter) {
2614 // Gather feeds directly into scatter.
2615 LLVM_DEBUG({
2616 auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType());
2617 assert(DstInpTy && "Cannot handle no vector type for llvm.scatter");
2618 unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
2619 unsigned DstElements = HVC.length(DstInpTy);
2620 auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType());
2621 assert(DstElemTy && "llvm.scatter needs vector of ptr argument");
2622 dbgs() << " Gather feeds into scatter\n Values to scatter : "
2623 << *Dst->getOperand(0) << "\n";
2624 dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements
2625 << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy
2626 << ") Access alignment(" << *Dst->getOperand(2) << ")\n";
2627 });
2628 // Address of source
2629 auto *Src = getPointer(IndexLoad);
2630 if (!Src)
2631 return nullptr;
2632 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2633
2634 if (!isa<PointerType>(Src->getType())) {
2635 LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n");
2636 return nullptr;
2637 }
2638
2639 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2640 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2641 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2642
2643 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2644 if (!DstLoad) {
2645 LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n");
2646 return nullptr;
2647 }
2648 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2649
2650 Value *Ptr = getPointer(DstLoad);
2651 if (!Ptr)
2652 return nullptr;
2653 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2654 Value *CastIndex =
2655 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad);
2656 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2657 // Shift all i16 left by 1 to match short addressing mode instead of
2658 // byte.
2659 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2660 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2661 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2662 LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n");
2663
2664 return Builder.CreateIntrinsic(
2665 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2666 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2667 AdjustedIndex},
2668 nullptr);
2669 } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
2670 // Gather feeds into previously inserted pseudo intrinsic.
2671 // These could not be in the same packet, so we need to generate another
2672 // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo
2673 // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,
2674 // ModRegs:$Mu, HvxVR:$Vv)
2675 if (isa<AllocaInst>(IndexLoad)) {
2676 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2677 if (cstDataVector) {
2678 // Our indexes are represented as a constant. We need THEM in a reg.
2679 // This most likely will not work properly since alloca gives us DDR
2680 // stack location. This will be fixed once we teach compiler about VTCM.
2681 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2682 [[maybe_unused]] auto *StoreIndexes =
2683 Builder.CreateStore(cstDataVector, IndexesAlloca);
2684 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2685 Value *LoadedIndex =
2686 Builder.CreateLoad(NT, IndexesAlloca, "reload_index");
2687 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2688 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n");
2689
2690 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2691 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2692 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2693
2694 Gather = Builder.CreateIntrinsic(
2695 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2696 {ResultAlloca, CastedSrc,
2697 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2698 nullptr);
2699 Value *LoadedResult = Builder.CreateLoad(
2700 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2701 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2702 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2703 In.replaceAllUsesWith(LoadedResult);
2704 }
2705 } else {
2706 // Address of source
2707 auto *Src = getPointer(IndexLoad);
2708 if (!Src)
2709 return nullptr;
2710 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2711
2712 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2713 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2714 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2715
2716 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2717 if (!DstLoad)
2718 return nullptr;
2719 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2720 auto *Ptr = getPointer(DstLoad);
2721 if (!Ptr)
2722 return nullptr;
2723 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2724
2725 Gather = Builder.CreateIntrinsic(
2726 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh,
2727 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2728 Indexes},
2729 nullptr);
2730 }
2731 return Gather;
2732 } else if (Qual == HvxIdioms::HEX_Scatter) {
2733 // This is the case when result of a gather is used as an argument to
2734 // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it
2735 // ourselves. We have to create alloca, store to it, and replace all uses
2736 // with that.
2737 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2738 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2739 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2740 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2741 Value *CastIndex =
2742 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2743 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2744
2745 Gather = Builder.CreateIntrinsic(
2746 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2747 {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2748 CastIndex},
2749 nullptr);
2750 Value *LoadedResult = Builder.CreateLoad(
2751 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2752 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2753 In.replaceAllUsesWith(LoadedResult);
2754 } else if (Qual == HvxIdioms::HEX_Gather) {
2755 // Gather feeds to another gather but already replaced with
2756 // hexagon_V6_vgathermh_128B
2757 if (isa<AllocaInst>(IndexLoad)) {
2758 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2759 if (cstDataVector) {
2760 // Our indexes are represented as a constant. We need it in a reg.
2761 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2762
2763 [[maybe_unused]] auto *StoreIndexes =
2764 Builder.CreateStore(cstDataVector, IndexesAlloca);
2765 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2766 Value *LoadedIndex =
2767 Builder.CreateLoad(NT, IndexesAlloca, "reload_index");
2768 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2769 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca
2770 << "\n AddressSpace: "
2771 << ResultAlloca->getAddressSpace() << "\n";);
2772
2773 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2774 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2775 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2776
2777 Gather = Builder.CreateIntrinsic(
2778 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2779 {ResultAlloca, CastedSrc,
2780 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2781 nullptr);
2782 Value *LoadedResult = Builder.CreateLoad(
2783 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2784 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2785 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2786 In.replaceAllUsesWith(LoadedResult);
2787 }
2788 }
2789 } else if (Qual == HvxIdioms::LLVM_Gather) {
2790 // Gather feeds into another gather
2791 errs() << " Underimplemented vgather to vgather sequence\n";
2792 return nullptr;
2793 } else
2794 llvm_unreachable("Unhandled Qual enum");
2795
2796 return Gather;
2797}
2798
2799// Go through all PHI incomming values and find minimal alignment for non GEP
2800// members.
2801std::optional<uint64_t> HvxIdioms::getPHIBaseMinAlignment(Instruction &In,
2802 PHINode *PN) const {
2803 if (!PN)
2804 return std::nullopt;
2805
2806 SmallVector<Value *, 16> Worklist;
2807 SmallPtrSet<Value *, 16> Visited;
2808 uint64_t minPHIAlignment = Value::MaximumAlignment;
2809 Worklist.push_back(PN);
2810
2811 while (!Worklist.empty()) {
2812 Value *V = Worklist.back();
2813 Worklist.pop_back();
2814 if (!Visited.insert(V).second)
2815 continue;
2816
2817 if (PHINode *PN = dyn_cast<PHINode>(V)) {
2818 for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) {
2819 Worklist.push_back(PN->getIncomingValue(i));
2820 }
2821 } else if (isa<GetElementPtrInst>(V)) {
2822 // Ignore geps for now.
2823 continue;
2824 } else {
2825 Align KnownAlign = getKnownAlignment(V, HVC.DL, &In, &HVC.AC, &HVC.DT);
2826 if (KnownAlign.value() < minPHIAlignment)
2827 minPHIAlignment = KnownAlign.value();
2828 }
2829 }
2830 if (minPHIAlignment != Value::MaximumAlignment)
2831 return minPHIAlignment;
2832 return std::nullopt;
2833}
2834
2835// Helper function to discover alignment for a ptr.
2836std::optional<uint64_t> HvxIdioms::getAlignment(Instruction &In,
2837 Value *ptr) const {
2838 SmallPtrSet<Value *, 16> Visited;
2839 return getAlignmentImpl(In, ptr, Visited);
2840}
2841
2842std::optional<uint64_t>
2843HvxIdioms::getAlignmentImpl(Instruction &In, Value *ptr,
2844 SmallPtrSet<Value *, 16> &Visited) const {
2845 LLVM_DEBUG(dbgs() << "[getAlignment] for : " << *ptr << "\n");
2846 // Prevent infinite recursion
2847 if (!Visited.insert(ptr).second)
2848 return std::nullopt;
2849 // Try AssumptionCache.
2850 Align KnownAlign = getKnownAlignment(ptr, HVC.DL, &In, &HVC.AC, &HVC.DT);
2851 // This is the most formal and reliable source of information.
2852 if (KnownAlign.value() > 1) {
2853 LLVM_DEBUG(dbgs() << " VC align(" << KnownAlign.value() << ")\n");
2854 return KnownAlign.value();
2855 }
2856
2857 // If it is a PHI try to iterate through inputs
2858 if (PHINode *PN = dyn_cast<PHINode>(ptr)) {
2859 // See if we have a common base to which we know alignment.
2860 auto baseAlignmentOpt = getPHIBaseMinAlignment(In, PN);
2861 if (!baseAlignmentOpt)
2862 return std::nullopt;
2863
2864 uint64_t minBaseAlignment = *baseAlignmentOpt;
2865 // If it is 1, there is no point to keep on looking.
2866 if (minBaseAlignment == 1)
2867 return 1;
2868 // No see if all other incomming phi nodes are just loop carried constants.
2869 uint64_t minPHIAlignment = minBaseAlignment;
2870 LLVM_DEBUG(dbgs() << " It is a PHI with(" << PN->getNumIncomingValues()
2871 << ")nodes and min base aligned to (" << minBaseAlignment
2872 << ")\n");
2873 for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) {
2874 Value *IV = PN->getIncomingValue(i);
2875 // We have already looked at all other values.
2877 continue;
2878 uint64_t MemberAlignment = Value::MaximumAlignment;
2879 if (auto res = getAlignment(*PN, IV))
2880 MemberAlignment = *res;
2881 else
2882 return std::nullopt;
2883 // Adjust total PHI alignment.
2884 if (minPHIAlignment > MemberAlignment)
2885 minPHIAlignment = MemberAlignment;
2886 }
2887 LLVM_DEBUG(dbgs() << " total PHI alignment(" << minPHIAlignment << ")\n");
2888 return minPHIAlignment;
2889 }
2890
2891 if (auto *GEP = dyn_cast<GetElementPtrInst>(ptr)) {
2892 auto *GEPPtr = GEP->getPointerOperand();
2893 // Only if this is the induction variable with const offset
2894 // Implicit assumption is that induction variable itself is a PHI
2895 if (&In == GEPPtr) {
2896 APInt Offset(HVC.DL.getPointerSizeInBits(
2897 GEPPtr->getType()->getPointerAddressSpace()),
2898 0);
2899 if (GEP->accumulateConstantOffset(HVC.DL, Offset)) {
2900 LLVM_DEBUG(dbgs() << " Induction GEP with const step of ("
2901 << Offset.getZExtValue() << ")\n");
2902 return Offset.getZExtValue();
2903 }
2904 }
2905 }
2906
2907 return std::nullopt;
2908}
2909
2910Value *HvxIdioms::processMStore(Instruction &In) const {
2911 [[maybe_unused]] auto *InpTy =
2912 dyn_cast<VectorType>(In.getOperand(0)->getType());
2913 assert(InpTy && "Cannot handle no vector type for llvm.masked.store");
2914
2915 LLVM_DEBUG(dbgs() << "\n[Process mstore](" << In << ")\n"
2916 << *In.getParent() << "\n");
2917 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2918 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2919 << ") type(" << *InpTy->getElementType() << ") of size("
2920 << InpTy->getScalarSizeInBits() << ")bits\n");
2921 auto *CI = dyn_cast<CallBase>(&In);
2922 assert(CI && "Expected llvm.masked.store to be a call");
2923 Align HaveAlign = CI->getParamAlign(1).valueOrOne();
2924
2925 uint64_t KA = 1;
2926 if (auto res = getAlignment(In, In.getOperand(1))) // ptr operand
2927 KA = *res;
2928 LLVM_DEBUG(dbgs() << " HaveAlign(" << HaveAlign.value() << ") KnownAlign("
2929 << KA << ")\n");
2930 // Normalize 0 -> ABI alignment of the stored value type (operand 0).
2931 Type *ValTy = In.getOperand(0)->getType();
2932 Align EffA =
2933 (KA > 0) ? Align(KA) : Align(HVC.DL.getABITypeAlign(ValTy).value());
2934
2935 if (EffA < HaveAlign)
2936 return nullptr;
2937
2938 // Attach/replace the param attribute on pointer param #1.
2939 AttrBuilder AttrB(CI->getContext());
2940 AttrB.addAlignmentAttr(EffA);
2941 CI->setAttributes(
2942 CI->getAttributes().addParamAttributes(CI->getContext(), 1, AttrB));
2943 return CI;
2944}
2945
2946Value *HvxIdioms::processMLoad(Instruction &In) const {
2947 [[maybe_unused]] auto *InpTy = dyn_cast<VectorType>(In.getType());
2948 assert(InpTy && "Cannot handle non vector type for llvm.masked.store");
2949 LLVM_DEBUG(dbgs() << "\n[Process mload](" << In << ")\n"
2950 << *In.getParent() << "\n");
2951 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2952 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2953 << ") type(" << *InpTy->getElementType() << ") of size("
2954 << InpTy->getScalarSizeInBits() << ")bits\n");
2955 auto *CI = dyn_cast<CallBase>(&In);
2956 assert(CI && "Expected to be a call to llvm.masked.load");
2957 // The pointer is operand #0, and its param attribute index is also 0.
2958 Align HaveAlign = CI->getParamAlign(0).valueOrOne();
2959
2960 // Compute best-known alignment KA from analysis.
2961 uint64_t KA = 1;
2962 if (auto res = getAlignment(In, In.getOperand(0))) // ptr operand
2963 KA = *res;
2964
2965 // Normalize 0 → ABI alignment of the loaded value type.
2966 Type *ValTy = In.getType();
2967 Align EffA =
2968 (KA > 0) ? Align(KA) : Align(HVC.DL.getABITypeAlign(ValTy).value());
2969 if (EffA < HaveAlign)
2970 return nullptr;
2971 LLVM_DEBUG(dbgs() << " HaveAlign(" << HaveAlign.value() << ") KnownAlign("
2972 << KA << ")\n");
2973
2974 // Attach/replace the param attribute on pointer param #0.
2975 AttrBuilder AttrB(CI->getContext());
2976 AttrB.addAlignmentAttr(EffA);
2977 CI->setAttributes(
2978 CI->getAttributes().addParamAttributes(CI->getContext(), 0, AttrB));
2979 return CI;
2980}
2981
2982auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
2983 const FxpOp &Op) const -> Value * {
2984 assert(Op.X.Val->getType() == Op.Y.Val->getType());
2985 auto *InpTy = cast<VectorType>(Op.X.Val->getType());
2986 unsigned Width = InpTy->getScalarSizeInBits();
2987 bool Rounding = Op.RoundAt.has_value();
2988
2989 if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
2990 // The fixed-point intrinsics do signed multiplication.
2991 if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
2992 Value *QMul = nullptr;
2993 if (Width == 16) {
2994 QMul = createMulQ15(Builder, Op.X, Op.Y, Rounding);
2995 } else if (Width == 32) {
2996 QMul = createMulQ31(Builder, Op.X, Op.Y, Rounding);
2997 }
2998 if (QMul != nullptr)
2999 return QMul;
3000 }
3001 }
3002
3003 assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n
3004 assert(Width < 32 || Width % 32 == 0); // Width > 32 => Width is 32*k
3005
3006 // If Width < 32, then it should really be 16.
3007 if (Width < 32) {
3008 if (Width < 16)
3009 return nullptr;
3010 // Getting here with Op.Frac == 0 isn't wrong, but suboptimal: here we
3011 // generate a full precision products, which is unnecessary if there is
3012 // no shift.
3013 assert(Width == 16);
3014 assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
3015 if (Op.Frac == 16) {
3016 // Multiply high
3017 if (Value *MulH = createMulH16(Builder, Op.X, Op.Y))
3018 return MulH;
3019 }
3020 // Do full-precision multiply and shift.
3021 Value *Prod32 = createMul16(Builder, Op.X, Op.Y);
3022 if (Rounding) {
3023 Value *RoundVal =
3024 ConstantInt::get(Prod32->getType(), 1ull << *Op.RoundAt);
3025 Prod32 = Builder.CreateAdd(Prod32, RoundVal, "add");
3026 }
3027
3028 Value *ShiftAmt = ConstantInt::get(Prod32->getType(), Op.Frac);
3029 Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
3030 ? Builder.CreateAShr(Prod32, ShiftAmt, "asr")
3031 : Builder.CreateLShr(Prod32, ShiftAmt, "lsr");
3032 return Builder.CreateTrunc(Shifted, InpTy, "trn");
3033 }
3034
3035 // Width >= 32
3036
3037 // Break up the arguments Op.X and Op.Y into vectors of smaller widths
3038 // in preparation of doing the multiplication by 32-bit parts.
3039 auto WordX = HVC.splitVectorElements(Builder, Op.X.Val, /*ToWidth=*/32);
3040 auto WordY = HVC.splitVectorElements(Builder, Op.Y.Val, /*ToWidth=*/32);
3041 auto WordP = createMulLong(Builder, WordX, Op.X.Sgn, WordY, Op.Y.Sgn);
3042
3043 auto *HvxWordTy = cast<VectorType>(WordP.front()->getType());
3044
3045 // Add the optional rounding to the proper word.
3046 if (Op.RoundAt.has_value()) {
3047 Value *Zero = Constant::getNullValue(WordX[0]->getType());
3048 SmallVector<Value *> RoundV(WordP.size(), Zero);
3049 RoundV[*Op.RoundAt / 32] =
3050 ConstantInt::get(HvxWordTy, 1ull << (*Op.RoundAt % 32));
3051 WordP = createAddLong(Builder, WordP, RoundV);
3052 }
3053
3054 // createRightShiftLong?
3055
3056 // Shift all products right by Op.Frac.
3057 unsigned SkipWords = Op.Frac / 32;
3058 Constant *ShiftAmt = ConstantInt::get(HvxWordTy, Op.Frac % 32);
3059
3060 for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
3061 int Src = Dst + SkipWords;
3062 Value *Lo = WordP[Src];
3063 if (Src + 1 < End) {
3064 Value *Hi = WordP[Src + 1];
3065 WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,
3066 {Hi, Lo, ShiftAmt},
3067 /*FMFSource*/ nullptr, "int");
3068 } else {
3069 // The shift of the most significant word.
3070 WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt, "asr");
3071 }
3072 }
3073 if (SkipWords != 0)
3074 WordP.resize(WordP.size() - SkipWords);
3075
3076 return HVC.joinVectorElements(Builder, WordP, Op.ResTy);
3077}
3078
3079auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
3080 bool Rounding) const -> Value * {
3081 assert(X.Val->getType() == Y.Val->getType());
3082 assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
3083 assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
3084
3085 // There is no non-rounding intrinsic for i16.
3086 if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
3087 return nullptr;
3088
3089 auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhvsrs);
3090 return HVC.createHvxIntrinsic(Builder, V6_vmpyhvsrs, X.Val->getType(),
3091 {X.Val, Y.Val});
3092}
3093
3094auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
3095 bool Rounding) const -> Value * {
3096 Type *InpTy = X.Val->getType();
3097 assert(InpTy == Y.Val->getType());
3098 assert(InpTy->getScalarType() == HVC.getIntTy(32));
3099 assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
3100
3101 if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
3102 return nullptr;
3103
3104 auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyewuh);
3105 auto V6_vmpyo_acc = Rounding
3106 ? HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_rnd_sacc)
3107 : HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_sacc);
3108 Value *V1 =
3109 HVC.createHvxIntrinsic(Builder, V6_vmpyewuh, InpTy, {X.Val, Y.Val});
3110 return HVC.createHvxIntrinsic(Builder, V6_vmpyo_acc, InpTy,
3111 {V1, X.Val, Y.Val});
3112}
3113
3114auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
3115 Value *CarryIn) const
3116 -> std::pair<Value *, Value *> {
3117 assert(X->getType() == Y->getType());
3118 auto VecTy = cast<VectorType>(X->getType());
3119 if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
3121 Intrinsic::ID AddCarry;
3122 if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
3123 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarryo);
3124 } else {
3125 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry);
3126 if (CarryIn == nullptr)
3127 CarryIn = Constant::getNullValue(HVC.getBoolTy(HVC.length(VecTy)));
3128 Args.push_back(CarryIn);
3129 }
3130 Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,
3131 /*RetTy=*/nullptr, Args);
3132 Value *Result = Builder.CreateExtractValue(Ret, {0}, "ext");
3133 Value *CarryOut = Builder.CreateExtractValue(Ret, {1}, "ext");
3134 return {Result, CarryOut};
3135 }
3136
3137 // In other cases, do a regular add, and unsigned compare-less-than.
3138 // The carry-out can originate in two places: adding the carry-in or adding
3139 // the two input values.
3140 Value *Result1 = X; // Result1 = X + CarryIn
3141 if (CarryIn != nullptr) {
3142 unsigned Width = VecTy->getScalarSizeInBits();
3143 uint32_t Mask = 1;
3144 if (Width < 32) {
3145 for (unsigned i = 0, e = 32 / Width; i != e; ++i)
3146 Mask = (Mask << Width) | 1;
3147 }
3148 auto V6_vandqrt = HVC.HST.getIntrinsicId(Hexagon::V6_vandqrt);
3149 Value *ValueIn =
3150 HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,
3151 {CarryIn, HVC.getConstInt(Mask)});
3152 Result1 = Builder.CreateAdd(X, ValueIn, "add");
3153 }
3154
3155 Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X, "cmp");
3156 Value *Result2 = Builder.CreateAdd(Result1, Y, "add");
3157 Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y, "cmp");
3158 return {Result2, Builder.CreateOr(CarryOut1, CarryOut2, "orb")};
3159}
3160
3161auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
3162 -> Value * {
3163 Intrinsic::ID V6_vmpyh = 0;
3164 std::tie(X, Y) = canonSgn(X, Y);
3165
3166 if (X.Sgn == Signed) {
3167 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhv);
3168 } else if (Y.Sgn == Signed) {
3169 // In vmpyhus the second operand is unsigned
3170 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhus);
3171 } else {
3172 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhv);
3173 }
3174
3175 // i16*i16 -> i32 / interleaved
3176 Value *P =
3177 HVC.createHvxIntrinsic(Builder, V6_vmpyh, HvxP32Ty, {Y.Val, X.Val});
3178 // Deinterleave
3179 return HVC.vshuff(Builder, HVC.sublo(Builder, P), HVC.subhi(Builder, P));
3180}
3181
3182auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
3183 -> Value * {
3184 Type *HvxI16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/false);
3185
3186 if (HVC.HST.useHVXV69Ops()) {
3187 if (X.Sgn != Signed && Y.Sgn != Signed) {
3188 auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhvs);
3189 return HVC.createHvxIntrinsic(Builder, V6_vmpyuhvs, HvxI16Ty,
3190 {X.Val, Y.Val});
3191 }
3192 }
3193
3194 Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);
3195 Value *Pair16 =
3196 Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty, "cst");
3197 unsigned Len = HVC.length(HvxP16Ty) / 2;
3198
3199 SmallVector<int, 128> PickOdd(Len);
3200 for (int i = 0; i != static_cast<int>(Len); ++i)
3201 PickOdd[i] = 2 * i + 1;
3202
3203 return Builder.CreateShuffleVector(
3204 HVC.sublo(Builder, Pair16), HVC.subhi(Builder, Pair16), PickOdd, "shf");
3205}
3206
3207auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
3208 -> std::pair<Value *, Value *> {
3209 assert(X.Val->getType() == Y.Val->getType());
3210 assert(X.Val->getType() == HvxI32Ty);
3211
3212 Intrinsic::ID V6_vmpy_parts;
3213 std::tie(X, Y) = canonSgn(X, Y);
3214
3215 if (X.Sgn == Signed) {
3216 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
3217 } else if (Y.Sgn == Signed) {
3218 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
3219 } else {
3220 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
3221 }
3222
3223 Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,
3224 {X.Val, Y.Val}, {HvxI32Ty});
3225 Value *Hi = Builder.CreateExtractValue(Parts, {0}, "ext");
3226 Value *Lo = Builder.CreateExtractValue(Parts, {1}, "ext");
3227 return {Lo, Hi};
3228}
3229
3230auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
3231 ArrayRef<Value *> WordY) const
3233 assert(WordX.size() == WordY.size());
3234 unsigned Idx = 0, Length = WordX.size();
3236
3237 while (Idx != Length) {
3238 if (HVC.isZero(WordX[Idx]))
3239 Sum[Idx] = WordY[Idx];
3240 else if (HVC.isZero(WordY[Idx]))
3241 Sum[Idx] = WordX[Idx];
3242 else
3243 break;
3244 ++Idx;
3245 }
3246
3247 Value *Carry = nullptr;
3248 for (; Idx != Length; ++Idx) {
3249 std::tie(Sum[Idx], Carry) =
3250 createAddCarry(Builder, WordX[Idx], WordY[Idx], Carry);
3251 }
3252
3253 // This drops the final carry beyond the highest word.
3254 return Sum;
3255}
3256
3257auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
3258 Signedness SgnX, ArrayRef<Value *> WordY,
3259 Signedness SgnY) const -> SmallVector<Value *> {
3260 SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
3261
3262 // WordX[i] * WordY[j] produces words i+j and i+j+1 of the results,
3263 // that is halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
3264 for (int i = 0, e = WordX.size(); i != e; ++i) {
3265 for (int j = 0, f = WordY.size(); j != f; ++j) {
3266 // Check the 4 halves that this multiplication can generate.
3267 Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
3268 Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
3269 auto [Lo, Hi] = createMul32(Builder, {WordX[i], SX}, {WordY[j], SY});
3270 Products[i + j + 0].push_back(Lo);
3271 Products[i + j + 1].push_back(Hi);
3272 }
3273 }
3274
3275 Value *Zero = Constant::getNullValue(WordX[0]->getType());
3276
3277 auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
3278 if (Vector.empty())
3279 return Zero;
3280 auto Last = Vector.back();
3281 Vector.pop_back();
3282 return Last;
3283 };
3284
3285 for (int i = 0, e = Products.size(); i != e; ++i) {
3286 while (Products[i].size() > 1) {
3287 Value *Carry = nullptr; // no carry-in
3288 for (int j = i; j != e; ++j) {
3289 auto &ProdJ = Products[j];
3290 auto [Sum, CarryOut] = createAddCarry(Builder, pop_back_or_zero(ProdJ),
3291 pop_back_or_zero(ProdJ), Carry);
3292 ProdJ.insert(ProdJ.begin(), Sum);
3293 Carry = CarryOut;
3294 }
3295 }
3296 }
3297
3299 for (auto &P : Products) {
3300 assert(P.size() == 1 && "Should have been added together");
3301 WordP.push_back(P.front());
3302 }
3303
3304 return WordP;
3305}
3306
// Driver for the idiom recognizer: visits the instructions of every basic
// block of HVC.F in reverse order and dispatches each one to the first
// matcher that accepts it (fixed-point multiply, gather, scatter, masked
// load, masked store). Returns true if any rewrite was reported.
//
// NOTE(review): the embedded listing numbers skip 3320, 3332 and 3342, so
// one statement may be missing from each of the first three branches in
// this copy of the file; confirm against the upstream source before
// relying on the exact statement sequence shown here.
3307 auto HvxIdioms::run() -> bool {
3308 bool Changed = false;
3309
3310 for (BasicBlock &B : HVC.F) {
3311 for (auto It = B.rbegin(); It != B.rend(); ++It) {
3312 if (auto Fxm = matchFxpMul(*It)) {
3313 Value *New = processFxpMul(*It, *Fxm);
3314 // Always report "changed" for now.
3315 Changed = true;
3316 if (!New)
3317 continue;
// If the replacement is not an instruction there is no position to resume
// from, so the scan of this block restarts from its end.
3318 bool StartOver = !isa<Instruction>(New);
3319 It->replaceAllUsesWith(New);
3321 It = StartOver ? B.rbegin()
3322 : cast<Instruction>(New)->getReverseIterator();
3323 Changed = true;
3324 } else if (matchGather(*It)) {
3325 Value *New = processVGather(*It);
3326 if (!New)
3327 continue;
3328 LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n");
3329 // We replace original intrinsic with a new pseudo call.
3330 It->eraseFromParent();
// Continue the reverse walk from the newly created instruction.
3331 It = cast<Instruction>(New)->getReverseIterator();
3333 Changed = true;
3334 } else if (matchScatter(*It)) {
3335 Value *New = processVScatter(*It);
3336 if (!New)
3337 continue;
3338 LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n");
3339 // We replace original intrinsic with a new pseudo call.
3340 It->eraseFromParent();
// Continue the reverse walk from the newly created instruction.
3341 It = cast<Instruction>(New)->getReverseIterator();
3343 Changed = true;
3344 } else if (matchMLoad(*It)) {
// Masked loads are updated in place; the iterator is left untouched.
3345 Value *New = processMLoad(*It);
3346 if (!New)
3347 continue;
3348 LLVM_DEBUG(dbgs() << " MLoad : " << *New << "\n");
3349 Changed = true;
3350 } else if (matchMStore(*It)) {
// Masked stores are updated in place; the iterator is left untouched.
3351 Value *New = processMStore(*It);
3352 if (!New)
3353 continue;
3354 LLVM_DEBUG(dbgs() << " MStore : " << *New << "\n");
3355 Changed = true;
3356 }
3357 }
3358 }
3359
3360 return Changed;
3361 }
3362
3363// --- End HvxIdioms
3364
3365auto HexagonVectorCombine::run() -> bool {
3366 if (DumpModule)
3367 dbgs() << "Module before HexagonVectorCombine\n" << *F.getParent();
3368
3369 bool Changed = false;
3370 if (HST.useHVXOps()) {
3371 if (VAEnabled)
3372 Changed |= AlignVectors(*this).run();
3373 if (VIEnabled)
3374 Changed |= HvxIdioms(*this).run();
3375 }
3376
3377 if (DumpModule) {
3378 dbgs() << "Module " << (Changed ? "(modified)" : "(unchanged)")
3379 << " after HexagonVectorCombine\n"
3380 << *F.getParent();
3381 }
3382 return Changed;
3383}
3384
3385auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
3386 return IntegerType::get(F.getContext(), Width);
3387}
3388
3389auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
3390 assert(ElemCount >= 0);
3391 IntegerType *ByteTy = Type::getInt8Ty(F.getContext());
3392 if (ElemCount == 0)
3393 return ByteTy;
3394 return VectorType::get(ByteTy, ElemCount, /*Scalable=*/false);
3395}
3396
3397auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
3398 assert(ElemCount >= 0);
3399 IntegerType *BoolTy = Type::getInt1Ty(F.getContext());
3400 if (ElemCount == 0)
3401 return BoolTy;
3402 return VectorType::get(BoolTy, ElemCount, /*Scalable=*/false);
3403}
3404
3405auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
3406 -> ConstantInt * {
3407 return ConstantInt::getSigned(getIntTy(Width), Val);
3408}
3409
3410auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
3411 if (auto *C = dyn_cast<Constant>(Val))
3412 return C->isNullValue();
3413 return false;
3414}
3415
3416auto HexagonVectorCombine::getIntValue(const Value *Val) const
3417 -> std::optional<APInt> {
3418 if (auto *CI = dyn_cast<ConstantInt>(Val))
3419 return CI->getValue();
3420 return std::nullopt;
3421}
3422
3423auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
3424 return isa<UndefValue>(Val);
3425}
3426
3427auto HexagonVectorCombine::isTrue(const Value *Val) const -> bool {
3428 return Val == ConstantInt::getTrue(Val->getType());
3429}
3430
3431auto HexagonVectorCombine::isFalse(const Value *Val) const -> bool {
3432 return isZero(Val);
3433}
3434
3435auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
3436 -> VectorType * {
3437 EVT ETy = EVT::getEVT(ElemTy, false);
3438 assert(ETy.isSimple() && "Invalid HVX element type");
3439 // Do not allow boolean types here: they don't have a fixed length.
3440 assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
3441 "Invalid HVX element type");
3442 unsigned HwLen = HST.getVectorLength();
3443 unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
3444 return VectorType::get(ElemTy, Pair ? 2 * NumElems : NumElems,
3445 /*Scalable=*/false);
3446}
3447
3448auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
3449 -> int {
3450 return getSizeOf(Val->getType(), Kind);
3451}
3452
3453auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
3454 -> int {
3455 auto *NcTy = const_cast<Type *>(Ty);
3456 switch (Kind) {
3457 case Store:
3458 return DL.getTypeStoreSize(NcTy).getFixedValue();
3459 case Alloc:
3460 return DL.getTypeAllocSize(NcTy).getFixedValue();
3461 }
3462 llvm_unreachable("Unhandled SizeKind enum");
3463}
3464
3465auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
3466 // The actual type may be shorter than the HVX vector, so determine
3467 // the alignment based on subtarget info.
3468 if (HST.isTypeForHVX(Ty))
3469 return HST.getVectorLength();
3470 return DL.getABITypeAlign(Ty).value();
3471}
3472
3473auto HexagonVectorCombine::length(Value *Val) const -> size_t {
3474 return length(Val->getType());
3475}
3476
3477auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
3478 auto *VecTy = dyn_cast<VectorType>(Ty);
3479 assert(VecTy && "Must be a vector type");
3480 return VecTy->getElementCount().getFixedValue();
3481}
3482
3483auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
3484 if (auto *In = dyn_cast<Instruction>(V)) {
3485 SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
3486 return simplifyInstruction(In, Q);
3487 }
3488 return nullptr;
3489}
3490
3491// Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
3492auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
3493 Value *Src, int Start, int Length,
3494 int Where) const -> Value * {
3495 assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
3496 int SrcLen = getSizeOf(Src);
3497 int DstLen = getSizeOf(Dst);
3498 assert(0 <= Start && Start + Length <= SrcLen);
3499 assert(0 <= Where && Where + Length <= DstLen);
3500
3501 int P2Len = PowerOf2Ceil(SrcLen | DstLen);
3502 auto *Poison = PoisonValue::get(getByteTy());
3503 Value *P2Src = vresize(Builder, Src, P2Len, Poison);
3504 Value *P2Dst = vresize(Builder, Dst, P2Len, Poison);
3505
3506 SmallVector<int, 256> SMask(P2Len);
3507 for (int i = 0; i != P2Len; ++i) {
3508 // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
3509 // Otherwise, pick Dst[i];
3510 SMask[i] =
3511 (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
3512 }
3513
3514 Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask, "shf");
3515 return vresize(Builder, P2Insert, DstLen, Poison);
3516}
3517
3518auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
3519 Value *Hi, Value *Amt) const -> Value * {
3520 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3521 if (isZero(Amt))
3522 return Hi;
3523 int VecLen = getSizeOf(Hi);
3524 if (auto IntAmt = getIntValue(Amt))
3525 return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),
3526 VecLen);
3527
3528 if (HST.isTypeForHVX(Hi->getType())) {
3529 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3530 "Expecting an exact HVX type");
3531 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_vlalignb),
3532 Hi->getType(), {Hi, Lo, Amt});
3533 }
3534
3535 if (VecLen == 4) {
3536 Value *Pair = concat(Builder, {Lo, Hi});
3537 Value *Shift =
3538 Builder.CreateLShr(Builder.CreateShl(Pair, Amt, "shl"), 32, "lsr");
3539 Value *Trunc =
3540 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3541 return Builder.CreateBitCast(Trunc, Hi->getType(), "cst");
3542 }
3543 if (VecLen == 8) {
3544 Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt, "sub");
3545 return vralignb(Builder, Lo, Hi, Sub);
3546 }
3547 llvm_unreachable("Unexpected vector length");
3548}
3549
3550auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
3551 Value *Hi, Value *Amt) const -> Value * {
3552 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3553 if (isZero(Amt))
3554 return Lo;
3555 int VecLen = getSizeOf(Lo);
3556 if (auto IntAmt = getIntValue(Amt))
3557 return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);
3558
3559 if (HST.isTypeForHVX(Lo->getType())) {
3560 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3561 "Expecting an exact HVX type");
3562 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_valignb),
3563 Lo->getType(), {Hi, Lo, Amt});
3564 }
3565
3566 if (VecLen == 4) {
3567 Value *Pair = concat(Builder, {Lo, Hi});
3568 Value *Shift = Builder.CreateLShr(Pair, Amt, "lsr");
3569 Value *Trunc =
3570 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3571 return Builder.CreateBitCast(Trunc, Lo->getType(), "cst");
3572 }
3573 if (VecLen == 8) {
3574 Type *Int64Ty = Type::getInt64Ty(F.getContext());
3575 Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst");
3576 Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst");
3577 Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb,
3578 {Hi64, Lo64, Amt},
3579 /*FMFSource=*/nullptr, "cup");
3580 return Builder.CreateBitCast(Call, Lo->getType(), "cst");
3581 }
3582 llvm_unreachable("Unexpected vector length");
3583}
3584
3585// Concatenates a sequence of vectors of the same type.
3586auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
3587 ArrayRef<Value *> Vecs) const -> Value * {
3588 assert(!Vecs.empty());
3590 std::vector<Value *> Work[2];
3591 int ThisW = 0, OtherW = 1;
3592
3593 Work[ThisW].assign(Vecs.begin(), Vecs.end());
3594 while (Work[ThisW].size() > 1) {
3595 auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());
3596 SMask.resize(length(Ty) * 2);
3597 std::iota(SMask.begin(), SMask.end(), 0);
3598
3599 Work[OtherW].clear();
3600 if (Work[ThisW].size() % 2 != 0)
3601 Work[ThisW].push_back(UndefValue::get(Ty));
3602 for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
3603 Value *Joined = Builder.CreateShuffleVector(
3604 Work[ThisW][i], Work[ThisW][i + 1], SMask, "shf");
3605 Work[OtherW].push_back(Joined);
3606 }
3607 std::swap(ThisW, OtherW);
3608 }
3609
3610 // Since there may have been some undefs appended to make shuffle operands
3611 // have the same type, perform the last shuffle to only pick the original
3612 // elements.
3613 SMask.resize(Vecs.size() * length(Vecs.front()->getType()));
3614 std::iota(SMask.begin(), SMask.end(), 0);
3615 Value *Total = Work[ThisW].front();
3616 return Builder.CreateShuffleVector(Total, SMask, "shf");
3617}
3618
3619auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
3620 int NewSize, Value *Pad) const -> Value * {
3622 auto *ValTy = cast<VectorType>(Val->getType());
3623 assert(ValTy->getElementType() == Pad->getType());
3624
3625 int CurSize = length(ValTy);
3626 if (CurSize == NewSize)
3627 return Val;
3628 // Truncate?
3629 if (CurSize > NewSize)
3630 return getElementRange(Builder, Val, /*Ignored*/ Val, 0, NewSize);
3631 // Extend.
3632 SmallVector<int, 128> SMask(NewSize);
3633 std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
3634 std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
3635 Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad, "spt");
3636 return Builder.CreateShuffleVector(Val, PadVec, SMask, "shf");
3637}
3638
3639auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
3640 Type *FromTy, Type *ToTy) const -> Value * {
3641 // Mask is a vector <N x i1>, where each element corresponds to an
3642 // element of FromTy. Remap it so that each element will correspond
3643 // to an element of ToTy.
3644 assert(isa<VectorType>(Mask->getType()));
3645
3646 Type *FromSTy = FromTy->getScalarType();
3647 Type *ToSTy = ToTy->getScalarType();
3648 if (FromSTy == ToSTy)
3649 return Mask;
3650
3651 int FromSize = getSizeOf(FromSTy);
3652 int ToSize = getSizeOf(ToSTy);
3653 assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
3654
3655 auto *MaskTy = cast<VectorType>(Mask->getType());
3656 int FromCount = length(MaskTy);
3657 int ToCount = (FromCount * FromSize) / ToSize;
3658 assert((FromCount * FromSize) % ToSize == 0);
3659
3660 auto *FromITy = getIntTy(FromSize * 8);
3661 auto *ToITy = getIntTy(ToSize * 8);
3662
3663 // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
3664 // -> trunc to <M x i1>.
3665 Value *Ext = Builder.CreateSExt(
3666 Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false), "sxt");
3667 Value *Cast = Builder.CreateBitCast(
3668 Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false), "cst");
3669 return Builder.CreateTrunc(
3670 Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false), "trn");
3671}
3672
3673// Bitcast to bytes, and return least significant bits.
3674auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
3675 -> Value * {
3676 Type *ScalarTy = Val->getType()->getScalarType();
3677 if (ScalarTy == getBoolTy())
3678 return Val;
3679
3680 Value *Bytes = vbytes(Builder, Val);
3681 if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
3682 return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)), "trn");
3683 // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
3684 // <1 x i1>.
3685 return Builder.CreateTrunc(Bytes, getBoolTy(), "trn");
3686}
3687
3688// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
3689auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
3690 -> Value * {
3691 Type *ScalarTy = Val->getType()->getScalarType();
3692 if (ScalarTy == getByteTy())
3693 return Val;
3694
3695 if (ScalarTy != getBoolTy())
3696 return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)), "cst");
3697 // For bool, return a sext from i1 to i8.
3698 if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
3699 return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy), "sxt");
3700 return Builder.CreateSExt(Val, getByteTy(), "sxt");
3701}
3702
3703auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
3704 unsigned Start, unsigned Length) const
3705 -> Value * {
3706 assert(Start + Length <= length(Val));
3707 return getElementRange(Builder, Val, /*Ignored*/ Val, Start, Length);
3708}
3709
3710auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
3711 -> Value * {
3712 size_t Len = length(Val);
3713 assert(Len % 2 == 0 && "Length should be even");
3714 return subvector(Builder, Val, 0, Len / 2);
3715}
3716
3717auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
3718 -> Value * {
3719 size_t Len = length(Val);
3720 assert(Len % 2 == 0 && "Length should be even");
3721 return subvector(Builder, Val, Len / 2, Len / 2);
3722}
3723
3724auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
3725 Value *Val1) const -> Value * {
3726 assert(Val0->getType() == Val1->getType());
3727 int Len = length(Val0);
3728 SmallVector<int, 128> Mask(2 * Len);
3729
3730 for (int i = 0; i != Len; ++i) {
3731 Mask[i] = 2 * i; // Even
3732 Mask[i + Len] = 2 * i + 1; // Odd
3733 }
3734 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3735}
3736
3737auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
3738 Value *Val1) const -> Value * { //
3739 assert(Val0->getType() == Val1->getType());
3740 int Len = length(Val0);
3741 SmallVector<int, 128> Mask(2 * Len);
3742
3743 for (int i = 0; i != Len; ++i) {
3744 Mask[2 * i + 0] = i; // Val0
3745 Mask[2 * i + 1] = i + Len; // Val1
3746 }
3747 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3748}
3749
3750auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
3751 Intrinsic::ID IntID, Type *RetTy,
3752 ArrayRef<Value *> Args,
3753 ArrayRef<Type *> ArgTys,
3754 ArrayRef<Value *> MDSources) const
3755 -> Value * {
3756 auto getCast = [&](IRBuilderBase &Builder, Value *Val,
3757 Type *DestTy) -> Value * {
3758 Type *SrcTy = Val->getType();
3759 if (SrcTy == DestTy)
3760 return Val;
3761
3762 // Non-HVX type. It should be a scalar, and it should already have
3763 // a valid type.
3764 assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
3765
3766 Type *BoolTy = Type::getInt1Ty(F.getContext());
3767 if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)
3768 return Builder.CreateBitCast(Val, DestTy, "cst");
3769
3770 // Predicate HVX vector.
3771 unsigned HwLen = HST.getVectorLength();
3772 Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
3773 : Intrinsic::hexagon_V6_pred_typecast_128B;
3774 return Builder.CreateIntrinsic(TC, {DestTy, Val->getType()}, {Val},
3775 /*FMFSource=*/nullptr, "cup");
3776 };
3777
3778 Function *IntrFn =
3779 Intrinsic::getOrInsertDeclaration(F.getParent(), IntID, ArgTys);
3780 FunctionType *IntrTy = IntrFn->getFunctionType();
3781
3782 SmallVector<Value *, 4> IntrArgs;
3783 for (int i = 0, e = Args.size(); i != e; ++i) {
3784 Value *A = Args[i];
3785 Type *T = IntrTy->getParamType(i);
3786 if (A->getType() != T) {
3787 IntrArgs.push_back(getCast(Builder, A, T));
3788 } else {
3789 IntrArgs.push_back(A);
3790 }
3791 }
3792 StringRef MaybeName = !IntrTy->getReturnType()->isVoidTy() ? "cup" : "";
3793 CallInst *Call = Builder.CreateCall(IntrFn, IntrArgs, MaybeName);
3794
3795 MemoryEffects ME = Call->getAttributes().getMemoryEffects();
3797 propagateMetadata(Call, MDSources);
3798
3799 Type *CallTy = Call->getType();
3800 if (RetTy == nullptr || CallTy == RetTy)
3801 return Call;
3802 // Scalar types should have RetTy matching the call return type.
3803 assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
3804 return getCast(Builder, Call, RetTy);
3805}
3806
3807auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
3808 Value *Vec,
3809 unsigned ToWidth) const
3811 // Break a vector of wide elements into a series of vectors with narrow
3812 // elements:
3813 // (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
3814 // -->
3815 // (a0, a1, a2, ...) // lowest "ToWidth" bits
3816 // (b0, b1, b2, ...) // the next lowest...
3817 // (c0, c1, c2, ...) // ...
3818 // ...
3819 //
3820 // The number of elements in each resulting vector is the same as
3821 // in the original vector.
3822
3823 auto *VecTy = cast<VectorType>(Vec->getType());
3824 assert(VecTy->getElementType()->isIntegerTy());
3825 unsigned FromWidth = VecTy->getScalarSizeInBits();
3826 assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
3827 assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
3828 unsigned NumResults = FromWidth / ToWidth;
3829
3830 SmallVector<Value *> Results(NumResults);
3831 Results[0] = Vec;
3832 unsigned Length = length(VecTy);
3833
3834 // Do it by splitting in half, since those operations correspond to deal
3835 // instructions.
3836 auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
3837 // Take V = Results[Begin], split it in L, H.
3838 // Store Results[Begin] = L, Results[(Begin+End)/2] = H
3839 // Call itself recursively split(Begin, Half), split(Half+1, End)
3840 if (Begin + 1 == End)
3841 return;
3842
3843 Value *Val = Results[Begin];
3844 unsigned Width = Val->getType()->getScalarSizeInBits();
3845
3846 auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);
3847 Value *VVal = Builder.CreateBitCast(Val, VTy, "cst");
3848
3849 Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));
3850
3851 unsigned Half = (Begin + End) / 2;
3852 Results[Begin] = sublo(Builder, Res);
3853 Results[Half] = subhi(Builder, Res);
3854
3855 splitFunc(Begin, Half, splitFunc);
3856 splitFunc(Half, End, splitFunc);
3857 };
3858
3859 splitInHalf(0, NumResults, splitInHalf);
3860 return Results;
3861}
3862
3863auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
3864 ArrayRef<Value *> Values,
3865 VectorType *ToType) const
3866 -> Value * {
3867 assert(ToType->getElementType()->isIntegerTy());
3868
3869 // If the list of values does not have power-of-2 elements, append copies
3870 // of the sign bit to it, to make the size be 2^n.
3871 // The reason for this is that the values will be joined in pairs, because
3872 // otherwise the shuffles will result in convoluted code. With pairwise
3873 // joins, the shuffles will hopefully be folded into a perfect shuffle.
3874 // The output will need to be sign-extended to a type with element width
3875 // being a power-of-2 anyways.
3876 SmallVector<Value *> Inputs(Values);
3877
3878 unsigned ToWidth = ToType->getScalarSizeInBits();
3879 unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
3880 assert(Width <= ToWidth);
3881 assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
3882 unsigned Length = length(Inputs.front()->getType());
3883
3884 unsigned NeedInputs = ToWidth / Width;
3885 if (Inputs.size() != NeedInputs) {
3886 // Having too many inputs is ok: drop the high bits (usual wrap-around).
3887 // If there are too few, fill them with the sign bit.
3888 Value *Last = Inputs.back();
3889 Value *Sign = Builder.CreateAShr(
3890 Last, ConstantInt::get(Last->getType(), Width - 1), "asr");
3891 Inputs.resize(NeedInputs, Sign);
3892 }
3893
3894 while (Inputs.size() > 1) {
3895 Width *= 2;
3896 auto *VTy = VectorType::get(getIntTy(Width), Length, false);
3897 for (int i = 0, e = Inputs.size(); i < e; i += 2) {
3898 Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);
3899 Inputs[i / 2] = Builder.CreateBitCast(Res, VTy, "cst");
3900 }
3901 Inputs.resize(Inputs.size() / 2);
3902 }
3903
3904 assert(Inputs.front()->getType() == ToType);
3905 return Inputs.front();
3906}
3907
3908auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
3909 Value *Ptr1) const
3910 -> std::optional<int> {
3911 // Try SCEV first.
3912 const SCEV *Scev0 = SE.getSCEV(Ptr0);
3913 const SCEV *Scev1 = SE.getSCEV(Ptr1);
3914 const SCEV *ScevDiff = SE.getMinusSCEV(Scev0, Scev1);
3915 if (auto *Const = dyn_cast<SCEVConstant>(ScevDiff)) {
3916 APInt V = Const->getAPInt();
3917 if (V.isSignedIntN(8 * sizeof(int)))
3918 return static_cast<int>(V.getSExtValue());
3919 }
3920
3921 struct Builder : IRBuilder<> {
3922 Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
3923 ~Builder() {
3924 for (Instruction *I : llvm::reverse(ToErase))
3925 I->eraseFromParent();
3926 }
3927 SmallVector<Instruction *, 8> ToErase;
3928 };
3929
3930#define CallBuilder(B, F) \
3931 [&](auto &B_) { \
3932 Value *V = B_.F; \
3933 if (auto *I = dyn_cast<Instruction>(V)) \
3934 B_.ToErase.push_back(I); \
3935 return V; \
3936 }(B)
3937
3938 auto Simplify = [this](Value *V) {
3939 if (Value *S = simplify(V))
3940 return S;
3941 return V;
3942 };
3943
3944 auto StripBitCast = [](Value *V) {
3945 while (auto *C = dyn_cast<BitCastInst>(V))
3946 V = C->getOperand(0);
3947 return V;
3948 };
3949
3950 Ptr0 = StripBitCast(Ptr0);
3951 Ptr1 = StripBitCast(Ptr1);
3953 return std::nullopt;
3954
3955 auto *Gep0 = cast<GetElementPtrInst>(Ptr0);
3956 auto *Gep1 = cast<GetElementPtrInst>(Ptr1);
3957 if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
3958 return std::nullopt;
3959 if (Gep0->getSourceElementType() != Gep1->getSourceElementType())
3960 return std::nullopt;
3961
3962 Builder B(Gep0->getParent());
3963 int Scale = getSizeOf(Gep0->getSourceElementType(), Alloc);
3964
3965 // FIXME: for now only check GEPs with a single index.
3966 if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
3967 return std::nullopt;
3968
3969 Value *Idx0 = Gep0->getOperand(1);
3970 Value *Idx1 = Gep1->getOperand(1);
3971
3972 // First, try to simplify the subtraction directly.
3973 if (auto *Diff = dyn_cast<ConstantInt>(
3974 Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
3975 return Diff->getSExtValue() * Scale;
3976
3977 KnownBits Known0 = getKnownBits(Idx0, Gep0);
3978 KnownBits Known1 = getKnownBits(Idx1, Gep1);
3979 APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
3980 if (Unknown.isAllOnes())
3981 return std::nullopt;
3982
3983 Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
3984 Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
3985 Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
3986 Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
3987 int Diff0 = 0;
3988 if (auto *C = dyn_cast<ConstantInt>(SubU)) {
3989 Diff0 = C->getSExtValue();
3990 } else {
3991 return std::nullopt;
3992 }
3993
3994 Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);
3995 Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
3996 Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
3997 Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
3998 int Diff1 = 0;
3999 if (auto *C = dyn_cast<ConstantInt>(SubK)) {
4000 Diff1 = C->getSExtValue();
4001 } else {
4002 return std::nullopt;
4003 }
4004
4005 return (Diff0 + Diff1) * Scale;
4006
4007#undef CallBuilder
4008}
4009
4010auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
4011 const Instruction *CtxI) const
4012 -> unsigned {
4013 return ComputeMaxSignificantBits(V, DL, &AC, CtxI, &DT);
4014}
4015
4016auto HexagonVectorCombine::getKnownBits(const Value *V,
4017 const Instruction *CtxI) const
4018 -> KnownBits {
4019 return computeKnownBits(V, DL, &AC, CtxI, &DT);
4020}
4021
4022auto HexagonVectorCombine::isSafeToClone(const Instruction &In) const -> bool {
4023 if (In.mayHaveSideEffects() || In.isAtomic() || In.isVolatile() ||
4024 In.isFenceLike() || In.mayReadOrWriteMemory()) {
4025 return false;
4026 }
4027 if (isa<CallBase>(In) || isa<AllocaInst>(In))
4028 return false;
4029 return true;
4030}
4031
4032template <typename T>
4033auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
4035 const T &IgnoreInsts) const
4036 -> bool {
4037 auto getLocOrNone =
4038 [this](const Instruction &I) -> std::optional<MemoryLocation> {
4039 if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
4040 switch (II->getIntrinsicID()) {
4041 case Intrinsic::masked_load:
4042 return MemoryLocation::getForArgument(II, 0, TLI);
4043 case Intrinsic::masked_store:
4044 return MemoryLocation::getForArgument(II, 1, TLI);
4045 }
4046 }
4048 };
4049
4050 // The source and the destination must be in the same basic block.
4051 const BasicBlock &Block = *In.getParent();
4052 assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
4053 // No PHIs.
4054 if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))
4055 return false;
4056
4058 return true;
4059 bool MayWrite = In.mayWriteToMemory();
4060 auto MaybeLoc = getLocOrNone(In);
4061
4062 auto From = In.getIterator();
4063 if (From == To)
4064 return true;
4065 bool MoveUp = (To != Block.end() && To->comesBefore(&In));
4066 auto Range =
4067 MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
4068 for (auto It = Range.first; It != Range.second; ++It) {
4069 const Instruction &I = *It;
4070 if (llvm::is_contained(IgnoreInsts, &I))
4071 continue;
4072 // assume intrinsic can be ignored
4073 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
4074 if (II->getIntrinsicID() == Intrinsic::assume)
4075 continue;
4076 }
4077 // Parts based on isSafeToMoveBefore from CoveMoverUtils.cpp.
4078 if (I.mayThrow())
4079 return false;
4080 if (auto *CB = dyn_cast<CallBase>(&I)) {
4081 if (!CB->hasFnAttr(Attribute::WillReturn))
4082 return false;
4083 if (!CB->hasFnAttr(Attribute::NoSync))
4084 return false;
4085 }
4086 if (I.mayReadOrWriteMemory()) {
4087 auto MaybeLocI = getLocOrNone(I);
4088 if (MayWrite || I.mayWriteToMemory()) {
4089 if (!MaybeLoc || !MaybeLocI)
4090 return false;
4091 if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
4092 return false;
4093 }
4094 }
4095 }
4096 return true;
4097}
4098
4099auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
4100 if (auto *VecTy = dyn_cast<VectorType>(Ty))
4101 return VecTy->getElementType() == getByteTy();
4102 return false;
4103}
4104
4105auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
4106 Value *Hi, int Start,
4107 int Length) const -> Value * {
4108 assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
4109 SmallVector<int, 128> SMask(Length);
4110 std::iota(SMask.begin(), SMask.end(), Start);
4111 return Builder.CreateShuffleVector(Lo, Hi, SMask, "shf");
4112}
4113
4114// Pass management.
4115
4116namespace {
4117class HexagonVectorCombineLegacy : public FunctionPass {
4118public:
4119 static char ID;
4120
4121 HexagonVectorCombineLegacy() : FunctionPass(ID) {}
4122
4123 StringRef getPassName() const override { return "Hexagon Vector Combine"; }
4124
4125 void getAnalysisUsage(AnalysisUsage &AU) const override {
4126 AU.setPreservesCFG();
4127 AU.addRequired<AAResultsWrapperPass>();
4128 AU.addRequired<AssumptionCacheTracker>();
4129 AU.addRequired<DominatorTreeWrapperPass>();
4130 AU.addRequired<ScalarEvolutionWrapperPass>();
4131 AU.addRequired<TargetLibraryInfoWrapperPass>();
4132 AU.addRequired<TargetPassConfig>();
4133 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
4134 FunctionPass::getAnalysisUsage(AU);
4135 }
4136
4137 bool runOnFunction(Function &F) override {
4138 if (skipFunction(F))
4139 return false;
4140 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
4141 AssumptionCache &AC =
4142 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
4143 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
4144 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
4145 TargetLibraryInfo &TLI =
4146 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
4147 auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
4148 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
4149 HexagonVectorCombine HVC(F, AA, AC, DT, SE, TLI, TM, ORE);
4150 return HVC.run();
4151 }
4152};
4153} // namespace
4154
4155char HexagonVectorCombineLegacy::ID = 0;
4156
4157INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
4158 "Hexagon Vector Combine", false, false)
4166INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
4167 "Hexagon Vector Combine", false, false)
4168
4170 return new HexagonVectorCombineLegacy();
4171}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static IntegerType * getIntTy(IRBuilderBase &B, const TargetLibraryInfo *TLI)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the DenseMap class.
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
hexagon bit simplify
Hexagon Common GEP
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
static Value * locateIndexesFromIntrinsic(Instruction *In)
Instruction * locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual)
Value * getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC, IRBuilderBase &Builder, LLVMContext &Ctx, Value *I)
static Value * locateIndexesFromGEP(Value *In)
#define CallBuilder(B, F)
Value * getPointer(Value *Ptr)
#define DEFAULT_HVX_VTCM_PAGE_SIZE
static Value * locateAddressFromIntrinsic(Instruction *In)
static Instruction * selectDestination(Instruction *In, HvxIdioms::DstQualifier &Qual)
Value * get_i32_Mask(const HexagonVectorCombine &HVC, IRBuilderBase &Builder, LLVMContext &Ctx, unsigned int pattern)
bool isArithmetic(unsigned Opc)
static Type * getIndexType(Value *In)
GetElementPtrInst * locateGepFromIntrinsic(Instruction *In)
Value * getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC, IRBuilderBase &Builder, LLVMContext &Ctx, Value *I)
static Align effectiveAlignForValueTy(const DataLayout &DL, Type *ValTy, int Requested)
iv Induction Variable Users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
#define H(x, y, z)
Definition MD5.cpp:56
static bool isCandidate(const MachineInstr *MI, Register &DefedReg, Register FrameReg)
static bool isUndef(const MachineInstr &MI)
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Remove Loads Into Fake Uses
static ConstantInt * getConstInt(MDNode *MD, unsigned NumOp)
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
Target-Independent Code Generator Pass Configuration Options pass.
static uint32_t getAlignment(const MCSectionCOFF &Sec)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
unsigned getAddressSpace() const
Return the address space for the allocation.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
InstListType::const_iterator const_iterator
Definition BasicBlock.h:171
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
AttributeList getAttributes() const
Return the attributes for this call.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
unsigned getPointerSizeInBits(unsigned AS=0) const
The size in bits of the pointer representation in a given address space.
Definition DataLayout.h:501
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
iterator_range< iterator > children()
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getRootNode()
getRootNode - This returns the entry node for the CFG of the function.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:310
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:155
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
const BasicBlock & front() const
Definition Function.h:860
const BasicBlock & back() const
Definition Function.h:862
DISubprogram * getSubprogram() const
Get the attached subprogram.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
bool isHVXVectorType(EVT VecTy, bool IncludeBool=false) const
unsigned getVectorLength() const
bool isTypeForHVX(Type *VecTy, bool IncludeBool=false) const
Intrinsic::ID getIntrinsicId(unsigned Opc) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition IRBuilder.h:1901
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2684
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2142
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1554
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2388
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2334
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2518
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1461
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2252
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1928
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1533
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2130
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2659
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1592
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1941
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2242
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2563
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2116
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1573
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2494
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const char * getOpcodeName() const
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
bool empty() const
Definition MapVector.h:79
void remove_if(Predicate Pred)
Remove the elements that match the predicate.
size_type size() const
Definition MapVector.h:58
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:246
bool onlyAccessesInaccessibleMem() const
Whether this function only (at most) accesses inaccessible memory.
Definition ModRef.h:265
static LLVM_ABI std::optional< MemoryLocation > getOrNone(const Instruction *Inst)
static LLVM_ABI MemoryLocation getForArgument(const CallBase *Call, unsigned ArgIdx, const TargetLibraryInfo *TLI)
Return a location representing a particular argument of a call.
OptimizationRemarkEmitter legacy analysis pass.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
Target-Independent Code Generator Pass Configuration Options.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:552
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
Rounding
Possible values of current rounding mode, which is specified in bits 23:22 of FPCR.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate
Predicate - These are "(BI << 5) | BO" for various predicates.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
initializer< Ty > init(const Ty &Val)
constexpr double e
@ User
could "use" a pointer
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI Instruction * getTerminator() const
LLVM_ABI Instruction & front() const
This is an optimization pass for GlobalISel generic memory operations.
FunctionPass * createHexagonVectorCombineLegacyPass()
@ Offset
Definition DWP.cpp:558
@ Length
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
@ Undef
Value of the register doesn't matter.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:356
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1790
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI Value * simplifyInstruction(Instruction *I, const SimplifyQuery &Q)
See if we can compute a simplified version of this instruction.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition Local.h:252
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2087
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2191
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
MaskT vshuff(ArrayRef< int > Vu, ArrayRef< int > Vv, unsigned Size, bool TakeOdd)
MaskT vdeal(ArrayRef< int > Vu, ArrayRef< int > Vv, unsigned Size, bool TakeOdd)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:863
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339