// DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask)
// Default to copying the destination value.
ShuffleMask.push_back(0);
ShuffleMask.push_back(1);
ShuffleMask.push_back(2);
ShuffleMask.push_back(3);

// Decode the immediate: bits [3:0] zero elements, bits [5:4] pick the
// destination slot, bits [7:6] pick the source element.
unsigned ZMask = Imm & 15;
unsigned CountD = (Imm >> 4) & 3;
unsigned CountS = (Imm >> 6) & 3;

// CountS selects which input element to use.
unsigned InVal = 4 + CountS;
// CountD specifies which element of the destination to update.
ShuffleMask[CountD] = InVal;
// (The ZMask zeroing of elements is elided in this excerpt.)
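
A quick worked example (my sketch, not from the original listing), using an immediate whose ZMask bits are all zero so the elided zeroing step does nothing:

  SmallVector<int, 4> Mask;
  DecodeINSERTPSMask(/*Imm=*/0x10, Mask);
  // Mask is now <0, 4, 2, 3>: CountS = 0, CountD = 1, so lane 1 reads
  // element 0 of the inserted source (indices >= 4), the rest pass through.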

// DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len,
//                         SmallVectorImpl<int> &ShuffleMask)
assert((Idx + Len) <= NumElts && "Insertion out of range");

// Start from the identity mask, then splice Len elements of the second
// source (indices NumElts and up) in at position Idx.
for (unsigned i = 0; i != NumElts; ++i)
  ShuffleMask.push_back(i);
for (unsigned i = 0; i != Len; ++i)
  ShuffleMask[Idx + i] = NumElts + i;

// DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask)
for (unsigned i = NElts / 2; i != NElts; ++i)
  ShuffleMask.push_back(NElts + i);
for (unsigned i = NElts / 2; i != NElts; ++i)
  ShuffleMask.push_back(i);

// DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask)
for (unsigned i = 0; i != NElts / 2; ++i)
  ShuffleMask.push_back(i);
for (unsigned i = 0; i != NElts / 2; ++i)
  ShuffleMask.push_back(NElts + i);
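
To make the two shapes concrete, a hedged sketch of the v4f32 case:

  SmallVector<int, 4> HL, LH;
  DecodeMOVHLPSMask(4, HL);  // HL = <6, 7, 2, 3>: low half from src2's high half.
  DecodeMOVLHPSMask(4, LH);  // LH = <0, 1, 4, 5>: high half from src2's low half.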

// DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask)
for (int i = 0, e = NumElts / 2; i < e; ++i) {
  ShuffleMask.push_back(2 * i);
  ShuffleMask.push_back(2 * i);
}

// DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask)
for (int i = 0, e = NumElts / 2; i < e; ++i) {
  ShuffleMask.push_back(2 * i + 1);
  ShuffleMask.push_back(2 * i + 1);
}
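
For a v4f32 input, a hedged sketch of the resulting duplication masks:

  SmallVector<int, 4> Even, Odd;
  DecodeMOVSLDUPMask(4, Even);  // Even = <0, 0, 2, 2>
  DecodeMOVSHDUPMask(4, Odd);   // Odd  = <1, 1, 3, 3>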

// DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask)
const unsigned NumLaneElts = 2;

for (unsigned l = 0; l < NumElts; l += NumLaneElts)
  for (unsigned i = 0; i < NumLaneElts; ++i)
    ShuffleMask.push_back(l);

// DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask)
const unsigned NumLaneElts = 16;

for (unsigned l = 0; l < NumElts; l += NumLaneElts)
  for (unsigned i = 0; i < NumLaneElts; ++i) {
    // Bytes below the shift amount are zeroed; the rest shift up in-lane.
    int M = SM_SentinelZero;
    if (i >= Imm) M = i - Imm + l;
    ShuffleMask.push_back(M);
  }

// DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask)
const unsigned NumLaneElts = 16;

for (unsigned l = 0; l < NumElts; l += NumLaneElts)
  for (unsigned i = 0; i < NumLaneElts; ++i) {
    // Bytes shifted in from beyond the lane become zero.
    unsigned Base = i + Imm;
    int M = Base + l;
    if (Base >= NumLaneElts) M = SM_SentinelZero;
    ShuffleMask.push_back(M);
  }
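
A hedged example of the byte-shift decode for a 128-bit PSRLDQ by four bytes:

  SmallVector<int, 16> Mask;
  DecodePSRLDQMask(16, 4, Mask);
  // Mask is <4, 5, ..., 15, Z, Z, Z, Z> with Z = SM_SentinelZero: every
  // byte moves down four positions and zeros shift in from the top.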

// DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask)
const unsigned NumLaneElts = 16;

for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
  for (unsigned i = 0; i != NumLaneElts; ++i) {
    unsigned Base = i + Imm;
    // If i + Imm points beyond the end of the lane, the byte comes from
    // the second input, so rebase it past the first source's elements.
    if (Base >= NumLaneElts)
      Base += NumElts - NumLaneElts;
    ShuffleMask.push_back(Base + l);
  }
}
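
A hedged example for a 128-bit PALIGNR by four bytes:

  SmallVector<int, 16> Mask;
  DecodePALIGNRMask(16, 4, Mask);
  // Mask is <4, 5, ..., 15, 16, 17, 18, 19>: the concatenation of the two
  // sources shifted right by four bytes (indices >= 16 hit the second input).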

// DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask)
assert(isPowerOf2_32(NumElts) && "NumElts should be power of 2");
// Not all bits of the immediate are used, so mask them off.
Imm = Imm & (NumElts - 1);
for (unsigned i = 0; i != NumElts; ++i)
  ShuffleMask.push_back(i + Imm);
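
Unlike PALIGNR, VALIGN rotates across the whole register; a hedged sketch:

  SmallVector<int, 8> Mask;
  DecodeVALIGNMask(8, 3, Mask);
  // Mask is <3, 4, 5, 6, 7, 8, 9, 10>: an element rotate of the source
  // pair that is not constrained to 128-bit lanes.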

// DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
//                 SmallVectorImpl<int> &ShuffleMask)
unsigned Size = NumElts * ScalarBits;
unsigned NumLanes = Size / 128;
if (NumLanes == 0) NumLanes = 1;  // Handle MMX.
unsigned NumLaneElts = NumElts / NumLanes;
// Replicate the 8-bit immediate so every 128-bit lane reads the same selectors.
unsigned SplatImm = (Imm & 0xff) * 0x01010101;

for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
  for (unsigned i = 0; i != NumLaneElts; ++i) {
    ShuffleMask.push_back(SplatImm % NumLaneElts + l);
    SplatImm /= NumLaneElts;
  }
}
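
As a sanity check (my example, not from the file), the classic dword-reversal immediate:

  SmallVector<int, 4> Mask;
  DecodePSHUFMask(4, 32, 0x1B, Mask);
  // Mask is <3, 2, 1, 0>: pshufd $0x1B reverses the four dwords; the
  // immediate is consumed two bits per element, lowest bits first.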

// DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask)
for (unsigned l = 0; l != NumElts; l += 8) {
  unsigned NewImm = Imm;
  // The low four words pass through unchanged.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    ShuffleMask.push_back(l + i);
  }
  // The high four words are selected two immediate bits at a time.
  for (unsigned i = 4, e = 8; i != e; ++i) {
    ShuffleMask.push_back(l + 4 + (NewImm & 3));
    NewImm >>= 2;
  }
}

// DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask)
for (unsigned l = 0; l != NumElts; l += 8) {
  unsigned NewImm = Imm;
  // The low four words are selected two immediate bits at a time.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    ShuffleMask.push_back(l + (NewImm & 3));
    NewImm >>= 2;
  }
  // The high four words pass through unchanged.
  for (unsigned i = 4, e = 8; i != e; ++i) {
    ShuffleMask.push_back(l + i);
  }
}

// DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask)
unsigned NumHalfElts = NumElts / 2;

for (unsigned l = 0; l != NumHalfElts; ++l)
  ShuffleMask.push_back(l + NumHalfElts);
for (unsigned h = 0; h != NumHalfElts; ++h)
  ShuffleMask.push_back(h);

// DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
//                 SmallVectorImpl<int> &ShuffleMask)
unsigned NumLaneElts = 128 / ScalarBits;

unsigned NewImm = Imm;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
  // Each half of a lane comes from a different source.
  for (unsigned s = 0; s != NumElts * 2; s += NumElts) {
    for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
      ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
      NewImm /= NumLaneElts;
    }
  }
  // The 8-bit immediate only covers one vXf32 lane, so reload it per lane.
  if (NumLaneElts == 4) NewImm = Imm;
}
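
A hedged example using the common "swap halves" immediate 0x4E:

  SmallVector<int, 4> Mask;
  DecodeSHUFPMask(4, 32, 0x4E, Mask);
  // Mask is <2, 3, 4, 5>: the low half selects a[2],a[3] from the first
  // source, the high half b[0],b[1] from the second (indices >= 4).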

// DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
//                  SmallVectorImpl<int> &ShuffleMask)
// Each 128-bit lane is unpacked independently.
unsigned NumLanes = (NumElts * ScalarBits) / 128;
if (NumLanes == 0) NumLanes = 1;  // Handle MMX.
unsigned NumLaneElts = NumElts / NumLanes;

for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
  for (unsigned i = l + NumLaneElts / 2, e = l + NumLaneElts; i != e; ++i) {
    ShuffleMask.push_back(i);           // Reads from dest/src1.
    ShuffleMask.push_back(i + NumElts); // Reads from src/src2.
  }
}

// DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
//                  SmallVectorImpl<int> &ShuffleMask)
unsigned NumLanes = (NumElts * ScalarBits) / 128;
if (NumLanes == 0) NumLanes = 1;  // Handle MMX.
unsigned NumLaneElts = NumElts / NumLanes;

for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
  for (unsigned i = l, e = l + NumLaneElts / 2; i != e; ++i) {
    ShuffleMask.push_back(i);           // Reads from dest/src1.
    ShuffleMask.push_back(i + NumElts); // Reads from src/src2.
  }
}
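
For v4f32 these produce the familiar interleaving masks (my sketch):

  SmallVector<int, 4> Lo, Hi;
  DecodeUNPCKLMask(4, 32, Lo);  // Lo = <0, 4, 1, 5>
  DecodeUNPCKHMask(4, 32, Hi);  // Hi = <2, 6, 3, 7>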

// DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask)
ShuffleMask.append(NumElts, 0);

// DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
//                          SmallVectorImpl<int> &ShuffleMask)
unsigned Scale = DstNumElts / SrcNumElts;

for (unsigned i = 0; i != Scale; ++i)
  for (unsigned j = 0; j != SrcNumElts; ++j)
    ShuffleMask.push_back(j);

// decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm,
//                           SmallVectorImpl<int> &ShuffleMask)
unsigned NumElementsInLane = 128 / ScalarSize;
unsigned NumLanes = NumElts / NumElementsInLane;

for (unsigned l = 0; l != NumElts; l += NumElementsInLane) {
  unsigned Index = (Imm % NumLanes) * NumElementsInLane;
  Imm /= NumLanes;  // Discard the bits we just used.
  // The upper half of the result reads from the second source.
  if (l >= (NumElts / 2))
    Index += NumElts;
  for (unsigned i = 0; i != NumElementsInLane; ++i)
    ShuffleMask.push_back(Index + i);
}

// DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
//                      SmallVectorImpl<int> &ShuffleMask)
unsigned HalfSize = NumElts / 2;

for (unsigned l = 0; l != 2; ++l) {
  // Each nibble of the immediate controls one 128-bit half of the result;
  // bit 3 of the nibble zeroes that half.
  unsigned HalfMask = Imm >> (l * 4);
  unsigned HalfBegin = (HalfMask & 0x3) * HalfSize;
  for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i)
    ShuffleMask.push_back((HalfMask & 8) ? SM_SentinelZero : (int)i);
}
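
A hedged example selecting the high half of source 1 and the low half of source 2:

  SmallVector<int, 8> Mask;
  DecodeVPERM2X128Mask(8, 0x21, Mask);
  // Mask is <4, 5, 6, 7, 8, 9, 10, 11>: low nibble 0x1 picks the high
  // half of the first source, high nibble 0x2 the low half of the second.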

// DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
//                  SmallVectorImpl<int> &ShuffleMask) -- excerpt
for (int i = 0, e = RawMask.size(); i < e; ++i) {
  uint64_t M = RawMask[i];
  // ... (undef- and zero-element handling elided) ...
  // For 256/512-bit vectors the base is the 128-bit lane containing i.
  int Base = (i / 16) * 16;
  // Only the least significant 4 bits of each mask byte index within the lane.
  int Index = Base + (M & 0xf);
  ShuffleMask.push_back(Index);
}

// DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask)
for (unsigned i = 0; i < NumElts; ++i) {
  // The 8-bit immediate wraps around for vectors with more than 8 elements.
  unsigned Bit = i % 8;
  ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElts + i : i);
}
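
A hedged example for a 4-element blend:

  SmallVector<int, 4> Mask;
  DecodeBLENDMask(4, 0xA, Mask);
  // Mask is <0, 5, 2, 7>: set immediate bits (1 and 3) take the element
  // from the second source (indices >= NumElts), clear bits keep the first.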

// DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
//                  SmallVectorImpl<int> &ShuffleMask) -- excerpt
assert(RawMask.size() == 16 && "Illegal VPPERM shuffle mask size");

// Bits[4:0] of each mask byte select the source byte (0-31); bits[7:5]
// select a permute operation applied to it.
for (int i = 0, e = RawMask.size(); i < e; ++i) {
  uint64_t M = RawMask[i];
  uint64_t PermuteOp = (M >> 5) & 0x7;
  if (PermuteOp == 4) {
    // Operation 4 zero-fills the destination byte.
    ShuffleMask.push_back(SM_SentinelZero);
    continue;
  }
  if (PermuteOp != 0) {
    // The remaining operations (invert, bit reverse, ...) cannot be
    // represented as a plain shuffle mask, so give up on the decode.
    ShuffleMask.clear();
    return;
  }
  uint64_t Index = M & 0x1F;
  ShuffleMask.push_back((int)Index);
}

// DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask)
for (unsigned l = 0; l != NumElts; l += 4)
  for (unsigned i = 0; i != 4; ++i)
    ShuffleMask.push_back(l + ((Imm >> (2 * i)) & 3));

// DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
//                      unsigned NumDstElts, bool IsAnyExtend,
//                      SmallVectorImpl<int> &ShuffleMask)
unsigned Scale = DstScalarBits / SrcScalarBits;
assert(SrcScalarBits < DstScalarBits &&
       "Expected zero extension mask to increase scalar size");

// Each source element is followed by Scale-1 zero (or undef) elements.
int Sentinel = IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero;
for (unsigned i = 0; i != NumDstElts; i++) {
  ShuffleMask.push_back(i);
  ShuffleMask.append(Scale - 1, Sentinel);
}

// DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask)
// Move the low element across and zero-fill the remainder.
ShuffleMask.push_back(0);
ShuffleMask.append(NumElts - 1, SM_SentinelZero);
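
A hedged sketch of the zero-extension decode for a PMOVZXBD-style extension (byte to dword):

  SmallVector<int, 16> Mask;
  DecodeZeroExtendMask(8, 32, 4, /*IsAnyExtend=*/false, Mask);
  // Mask is <0, Z, Z, Z, 1, Z, Z, Z, 2, Z, Z, Z, 3, Z, Z, Z> with
  // Z = SM_SentinelZero: each byte lands in the low byte of a dword.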

// DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl<int> &ShuffleMask)
// The low element comes from the second source; the load form zeroes the
// upper elements, the register form copies them from the first source.
ShuffleMask.push_back(NumElts);
for (unsigned i = 1; i < NumElts; i++)
  ShuffleMask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);

// DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
//                  SmallVectorImpl<int> &ShuffleMask) -- excerpt
unsigned HalfElts = NumElts / 2;

// The bit extraction can only be decoded as a shuffle if both the length
// and the index are whole multiples of the element size.
if (0 != (Len % EltSize) || 0 != (Idx % EltSize))
  return;

// If length + index exceeds the bottom 64 bits the result is undefined.
if ((Len + Idx) > 64) {
  ShuffleMask.append(NumElts, SM_SentinelUndef);
  return;
}

// ... (Len and Idx converted from bits to elements) ...

// EXTRQ: extract Len elements starting at Idx and zero-pad the rest of
// the low 64 bits; the upper 64 bits are undefined.
for (int i = 0; i != Len; ++i)
  ShuffleMask.push_back(i + Idx);
for (int i = Len; i != (int)HalfElts; ++i)
  ShuffleMask.push_back(SM_SentinelZero);
for (int i = HalfElts; i != (int)NumElts; ++i)
  ShuffleMask.push_back(SM_SentinelUndef);

// DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
//                    SmallVectorImpl<int> &ShuffleMask) -- excerpt
unsigned HalfElts = NumElts / 2;

if (0 != (Len % EltSize) || 0 != (Idx % EltSize))
  return;

if ((Len + Idx) > 64) {
  ShuffleMask.append(NumElts, SM_SentinelUndef);
  return;
}

// INSERTQ: keep the low Idx elements, insert the lowest Len elements of
// the second source, keep the rest of the low half; the upper half is
// undefined.
for (int i = 0; i != Idx; ++i)
  ShuffleMask.push_back(i);
for (int i = 0; i != Len; ++i)
  ShuffleMask.push_back(i + NumElts);
for (int i = Idx + Len; i != (int)HalfElts; ++i)
  ShuffleMask.push_back(i);
for (int i = HalfElts; i != (int)NumElts; ++i)
  ShuffleMask.push_back(SM_SentinelUndef);

// DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
//                    ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
//                    SmallVectorImpl<int> &ShuffleMask) -- excerpt
unsigned VecSize = NumElts * ScalarBits;
unsigned NumLanes = VecSize / 128;
unsigned NumEltsPerLane = NumElts / NumLanes;
assert((VecSize == 128 || VecSize == 256 || VecSize == 512) &&
       "Unexpected vector size");
assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");

for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
  uint64_t M = RawMask[i];
  // ... (undef-element handling elided) ...
  // PD uses bit 1 of each selector, PS uses bits [1:0]; either way the
  // decoded index stays within the element's own 128-bit lane.
  M = (ScalarBits == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
  unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
  ShuffleMask.push_back((int)(LaneOffset + M));
}

// DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
//                     ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
//                     SmallVectorImpl<int> &ShuffleMask) -- excerpt
unsigned VecSize = NumElts * ScalarBits;
unsigned NumLanes = VecSize / 128;
unsigned NumEltsPerLane = NumElts / NumLanes;

assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");
assert((NumElts == RawMask.size()) && "Unexpected mask size");

for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
  uint64_t Selector = RawMask[i];
  unsigned MatchBit = (Selector >> 3) & 0x1;

  // When the M2Z immediate enables it, a selector whose MatchBit disagrees
  // with M2Z's low bit zeroes the element.
  if ((M2Z & 0x2) != 0 && MatchBit != (M2Z & 0x1)) {
    ShuffleMask.push_back(SM_SentinelZero);
    continue;
  }

  // In-lane element index: PD uses selector bit 1, PS uses bits [1:0].
  int Index = i & ~(NumEltsPerLane - 1);
  if (ScalarBits == 64)
    Index += (Selector >> 1) & 0x1;
  else
    Index += Selector & 0x3;

  // Selector bit 2 picks between the two sources.
  int Src = (Selector >> 2) & 0x1;
  Index += Src * NumElts;
  ShuffleMask.push_back(Index);
}

// DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
//                  SmallVectorImpl<int> &ShuffleMask) -- excerpt
for (int i = 0, e = RawMask.size(); i != e; ++i) {
  uint64_t M = RawMask[i];
  // ... (undef-element handling and index masking elided) ...
  ShuffleMask.push_back((int)M);
}

// DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
//                   SmallVectorImpl<int> &ShuffleMask) -- excerpt
for (int i = 0, e = RawMask.size(); i != e; ++i) {
  uint64_t M = RawMask[i];
  // ... (undef-element handling and index masking elided) ...
  // Indices >= RawMask.size() select from the second source table.
  ShuffleMask.push_back((int)M);
}
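
A hedged sketch of driving the variable decoder with a raw reversal mask; the APInt argument marks which raw elements were undef:

  uint64_t Raw[8] = {7, 6, 5, 4, 3, 2, 1, 0};
  APInt Undefs(8, 0);  // no undefined mask elements
  SmallVector<int, 8> Mask;
  DecodeVPERMVMask(ArrayRef<uint64_t>(Raw), Undefs, Mask);
  // Mask is <7, 6, 5, 4, 3, 2, 1, 0>: each raw constant directly names
  // the source element for that lane.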
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a PSWAPD 3DNow! instruction.
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len, SmallVectorImpl< int > &ShuffleMask)
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of a subvector to a larger vector type.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
class APInt
Class for arbitrary precision integers.
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
size_t size() const
size - Get the array size.
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immediate mask into a shuffle mask.
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.