LLVM 9.0.0svn
X86InterleavedAccess.cpp
//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, reference to the first indices of each interleaved-vector
  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // Check the supported combinations of element size, access kind, factor and
  // total width (WideInstSize) enumerated above.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}
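
// Illustrative example (not from the original file): on AVX, a stride-4 group
// over 64-bit elements with WideInstSize == 1024 is accepted, e.g.
//   %wide.vec = load <16 x i64>, <16 x i64>* %ptr
//   %v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef,
//                       <4 x i32> <i32 0, i32 4, i32 8, i32 12>
// while the same group without AVX, or with Factor == 2, is rejected.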

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Builder, Indices[i],
                                   SubVecTy->getVectorNumElements(), 0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy, *VecBasePtrTy;
  Value *VecBasePtr;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, load the information
  // in the following way:
  // [0,1,...,VF/2-1, VF/2+VF, VF/2+VF+1,...,2VF-1]
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  if (VecLength == 768 || VecLength == 1536) {
    VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
  }
  // Generate N loads of T type.
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment());
    DecomposedVectors.push_back(NewLoad);
  }
}
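
// Illustrative sketch (not from the original file): for a stride-4 group over
// a <16 x i64> wide load, decompose() takes the non-768/1536-bit path and
// emits four consecutive <4 x i64> loads, roughly:
//   %base = bitcast <16 x i64>* %ptr to <4 x i64>*
//   %gep0 = getelementptr <4 x i64>, <4 x i64>* %base, i32 0
//   %ld0  = load <4 x i64>, <4 x i64>* %gep0
//   ... and likewise %ld1..%ld3 at indices 1..3.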

// Change the scale of the vector type: halve the number of elements and
// double the scalar size.
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}
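
// Illustrative example (not from the original file): the total bit width is
// preserved while the element granularity doubles, e.g.
//   scaleVectorType(MVT::v32i8)  == MVT::v16i16
//   scaleVectorType(MVT::v16i16) == MVT::v8i32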

static uint32_t Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};

// genShuffleBland - Creates a blend shuffle mask over two vectors. This
// function only works on instructions whose lanes fit inside 256-bit
// registers. From the mask 'Mask' it creates a new mask 'Out' by applying an
// offset to each index: 'LowOffset' is added to indices that refer to the
// first vector and 'HighOffset' to indices that refer to the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror to the load, we must consider the
// element order as above. In this function we combine two kinds of shuffles:
// the first is a vpshufb-style per-lane shuffle and the second is a "blend"
// shuffle. By computing the shuffle on a sequence of 16 elements (one lane)
// and adding the correct offset, we create a vpshufb + blend sequence between
// two shuffles.
static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
                            SmallVectorImpl<uint32_t> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + LowOffset);
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + HighOffset + NumOfElm);
}
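
// Illustrative example (not from the original file): with VF =
// VT.getVectorNumElements() and a lane-local 'Mask' of M entries, the result is
//   Out = { Mask[0]+LowOffset,     ..., Mask[M-1]+LowOffset,
//           Mask[0]+HighOffset+VF, ..., Mask[M-1]+HighOffset+VF }
// so the first half selects from the first shuffle operand and the second half
// (indices >= VF) from the second operand.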

// reorderSubVector returns the data to its original state, and is de facto the
// opposite of the function concatSubVector.

// For VecElems = 16
// Invec[0] - |0|      TransposedMatrix[0] - |0|
// Invec[1] - |1| =>   TransposedMatrix[1] - |1|
// Invec[2] - |2|      TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] - |0|3|    TransposedMatrix[0] - |0|1|
// Invec[1] - |1|4| => TransposedMatrix[1] - |2|3|
// Invec[2] - |2|5|    TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] - |0|3|6|9 |    TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] - |1|4|7|10| => TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] - |2|5|8|11|    TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> Builder) {

  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(
          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
    return;
  }

  SmallVector<uint32_t, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  }
  for (unsigned i = 0; i < Stride; i++)
    TransposedMatrix[i] =
        Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<uint32_t, 16> MaskLow;
  SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<uint32_t, 32> MaskHigh;
  SmallVector<uint32_t, 32> MaskLow;
  SmallVector<uint32_t, 32> LowHighMask[2];
  SmallVector<uint32_t, 32> MaskHighTemp;
  SmallVector<uint32_t, 32> MaskLowTemp;

  // MaskLow and MaskHigh follow the vpunpcklbw and vpunpckhbw X86
  // shuffle patterns.

  createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);

  // MaskLowTemp and MaskHighTemp follow the vpunpckldw and vpunpckhdw X86
  // shuffle patterns, scaled up to word-sized groups.

  createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0 m0 c1 m1 ... c7 m7   | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0 k0 y1 k1 ... y7 k7   | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6  cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
                   NumOfElm, 4, Builder);
}

// createShuffleStride returns a shuffle mask of size N.
// The shuffle pattern is as follows:
// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
// (VF/Lane), (VF/Lane)+Stride%(VF/Lane),...,
// (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
// where Lane is the # of lanes in a register:
// VectorSize = 128 => Lane = 1
// VectorSize = 256 => Lane = 2
// For example, the shuffle pattern for VF 16 with register size 256 (lanes = 2)
// is {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}.
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<uint32_t> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}

// setGroupSize sets 'SizeInfo' to the sizes (number of elements) of the groups
// inside a shuffle mask. A mask contains exactly 3 groups, where each group is
// a monotonically increasing sequence with stride 3.
// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}
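
// Illustrative example (not from the original file): for one 128-bit lane of
// bytes (VF = 16), the strided mask {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}
// contains groups of sizes ceil(16/3), ceil(14/3) and ceil(15/3):
//   SmallVector<uint32_t, 3> SizeInfo;
//   setGroupSize(MVT::v16i8, SizeInfo); // SizeInfo == {6, 5, 5}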

// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
// vpalignr works per lane, where Lane is the # of lanes in a register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 the shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// The Imm variable sets the offset amount. The resulting mask is stored in the
// ShuffleMask vector, built as described above. AlignDirection is a boolean
// that indicates the direction of the alignment (false - align to the "right"
// side while true - align to the "left" side).
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<uint32_t> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i+Offset is out of this lane then we actually need the other
      // source. If Unary, the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}
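
// Illustrative example (not from the original file): for one 128-bit lane of
// bytes, an alignment of 5 selects bytes 5..15 of the first source followed by
// bytes 0..4 of the second source:
//   SmallVector<uint32_t, 16> M;
//   DecodePALIGNRMask(MVT::v16i8, 5, M);
//   // M == {5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}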

// concatSubVector - The function rebuilds the data to the correct expected
// order. An assumption about the shape of the matrix was made so that the
// deinterleave works with lane instructions like 'vpalignr' or 'vpshufb'.
// This function ensures that the data is built in the correct way for the
// lane instructions. Each lane inside the vector is 128 bits long.
//
// The 'InVec' argument contains the data in increasing order. In InVec[0] you
// can find the first 128 bits of data. The number of different lanes inside a
// vector depends on 'VecElems'. In general, the formula is
// VecElems * type / 128. The size of the array 'InVec' depends on, and equals,
// 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|      Vec[0] - |0|
// Invec[1] - |1| =>   Vec[1] - |1|
// Invec[2] - |2|      Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|    Vec[0] - |0|3|
// Invec[1] - |2|3| => Vec[1] - |1|4|
// Invec[2] - |4|5|    Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |    Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 | => Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|    Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[2];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;
  SmallVector<uint32_t, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(
        Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(
      Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(
      Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}

// group2Shuffle reorders the shuffle stride back into continuous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
                          SmallVectorImpl<uint32_t> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index, compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 3> GroupSize;
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[3];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(
      InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(
      InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  uint32_t IntMask1[] = {0, 1, 4, 5};
  ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  uint32_t IntMask2[] = {2, 3, 6, 7};
  Mask = makeArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  uint32_t IntMask3[] = {0, 4, 2, 6};
  Mask = makeArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  uint32_t IntMask4[] = {1, 5, 3, 7};
  Mask = makeArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
    case 64:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}
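
// Illustrative end-to-end sketch (not from the original file): given the
// stride-4 load group
//   %wide.vec = load <16 x i64>, <16 x i64>* %ptr
//   %v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef,
//                       <4 x i32> <i32 0, i32 4, i32 8, i32 12>
//   ... (%v1..%v3 start at indices 1..3)
// decompose() emits four <4 x i64> loads, transpose_4x4() produces the four
// deinterleaved rows, and each %vi is replaced with
// TransposedVectors[Indices[i]].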

// Lower interleaved load(s) into target specific instructions/
// intrinsics. Lowering sequence varies depending on the vector-types, factor,
// number of shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(&SVI, 1);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}