LLVM  14.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// About Cost Model numbers used below it's necessary to say the following:
16 /// the numbers correspond to some "generic" X86 CPU instead of usage of
17 /// concrete CPU model. Usually the numbers correspond to CPU where the feature
18 /// appeared for the first time. For example, if we do Subtarget.hasSSE42() in
19 /// the lookups below the cost is based on Nehalem as that was the first CPU
20 /// to support that feature level and thus has most likely the worst case cost.
21 /// Some examples of other technologies/CPUs:
22 /// SSE 3 - Pentium4 / Athlon64
23 /// SSE 4.1 - Penryn
24 /// SSE 4.2 - Nehalem
25 /// AVX - Sandy Bridge
26 /// AVX2 - Haswell
27 /// AVX-512 - Xeon Phi / Skylake
28 /// And some examples of instruction target dependent costs (latency)
29 /// divss sqrtss rsqrtss
30 /// AMD K7 11-16 19 3
31 /// Piledriver 9-24 13-15 5
32 /// Jaguar 14 16 2
33 /// Pentium II,III 18 30 2
34 /// Nehalem 7-14 7-18 3
35 /// Haswell 10-13 11 5
36 /// TODO: Develop and implement the target dependent cost model and
37 /// specialize cost numbers for different Cost Model Targets such as throughput,
38 /// code size, latency and uop count.
39 //===----------------------------------------------------------------------===//
40 
41 #include "X86TargetTransformInfo.h"
44 #include "llvm/CodeGen/CostTable.h"
46 #include "llvm/IR/IntrinsicInst.h"
47 #include "llvm/Support/Debug.h"
48 
49 using namespace llvm;
50 
51 #define DEBUG_TYPE "x86tti"
52 
53 //===----------------------------------------------------------------------===//
54 //
55 // X86 cost model.
56 //
57 //===----------------------------------------------------------------------===//
58 
60 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62  // TODO: Currently the __builtin_popcount() implementation using SSE3
63  // instructions is inefficient. Once the problem is fixed, we should
64  // call ST->hasSSE3() instead of ST->hasPOPCNT().
66 }
67 
70  switch (Level) {
72  // - Penryn
73  // - Nehalem
74  // - Westmere
75  // - Sandy Bridge
76  // - Ivy Bridge
77  // - Haswell
78  // - Broadwell
79  // - Skylake
80  // - Kabylake
81  return 32 * 1024; // 32 KByte
83  // - Penryn
84  // - Nehalem
85  // - Westmere
86  // - Sandy Bridge
87  // - Ivy Bridge
88  // - Haswell
89  // - Broadwell
90  // - Skylake
91  // - Kabylake
92  return 256 * 1024; // 256 KByte
93  }
94 
95  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
96 }
97 
100  // - Penryn
101  // - Nehalem
102  // - Westmere
103  // - Sandy Bridge
104  // - Ivy Bridge
105  // - Haswell
106  // - Broadwell
107  // - Skylake
108  // - Kabylake
109  switch (Level) {
113  return 8;
114  }
115 
116  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
117 }
118 
119 unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
120  bool Vector = (ClassID == 1);
121  if (Vector && !ST->hasSSE1())
122  return 0;
123 
124  if (ST->is64Bit()) {
125  if (Vector && ST->hasAVX512())
126  return 32;
127  return 16;
128  }
129  return 8;
130 }
131 
132 TypeSize
134  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
135  switch (K) {
137  return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
139  if (ST->hasAVX512() && PreferVectorWidth >= 512)
140  return TypeSize::getFixed(512);
141  if (ST->hasAVX() && PreferVectorWidth >= 256)
142  return TypeSize::getFixed(256);
143  if (ST->hasSSE1() && PreferVectorWidth >= 128)
144  return TypeSize::getFixed(128);
145  return TypeSize::getFixed(0);
147  return TypeSize::getScalable(0);
148  }
149 
150  llvm_unreachable("Unsupported register kind");
151 }
152 
153 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
155  .getFixedSize();
156 }
157 
158 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
159  // If the loop will not be vectorized, don't interleave the loop.
160  // Let regular unroll to unroll the loop, which saves the overflow
161  // check and memory check cost.
162  if (VF == 1)
163  return 1;
164 
165  if (ST->isAtom())
166  return 1;
167 
168  // Sandybridge and Haswell have multiple execution ports and pipelined
169  // vector units.
170  if (ST->hasAVX())
171  return 4;
172 
173  return 2;
174 }
175 
177  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
179  TTI::OperandValueProperties Opd1PropInfo,
181  const Instruction *CxtI) {
182  // TODO: Handle more cost kinds.
184  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
185  Op2Info, Opd1PropInfo,
186  Opd2PropInfo, Args, CxtI);
187 
188  // vXi8 multiplications are always promoted to vXi16.
189  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
190  Ty->getScalarSizeInBits() == 8) {
191  Type *WideVecTy =
192  VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
193  return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
195  CostKind) +
196  getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
198  CostKind) +
199  getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
200  Opd1PropInfo, Opd2PropInfo);
201  }
202 
203  // Legalize the type.
204  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
205 
206  int ISD = TLI->InstructionOpcodeToISD(Opcode);
207  assert(ISD && "Invalid opcode");
208 
209  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
210  LT.second.getScalarType() == MVT::i32) {
211  // Check if the operands can be represented as a smaller datatype.
212  bool Op1Signed = false, Op2Signed = false;
213  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
214  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
215  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
216 
217  // If both are representable as i15 and at least one is zero-extended,
218  // then we can treat this as PMADDWD which has the same costs
219  // as a vXi16 multiply..
220  if (OpMinSize <= 15 && (!Op1Signed || !Op2Signed) && !ST->isPMADDWDSlow())
221  LT.second =
222  MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
223  }
224 
225  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
226  ISD == ISD::UREM) &&
229  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
230  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
231  // On X86, vector signed division by constants power-of-two are
232  // normally expanded to the sequence SRA + SRL + ADD + SRA.
233  // The OperandValue properties may not be the same as that of the previous
234  // operation; conservatively assume OP_None.
235  InstructionCost Cost =
236  2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
239  Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
242  Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
245 
246  if (ISD == ISD::SREM) {
247  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
248  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
249  Op2Info);
250  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
251  Op2Info);
252  }
253 
254  return Cost;
255  }
256 
257  // Vector unsigned division/remainder will be simplified to shifts/masks.
258  if (ISD == ISD::UDIV)
259  return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
262  // UREM
263  return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
266  }
267 
268  static const CostTblEntry GLMCostTable[] = {
269  { ISD::FDIV, MVT::f32, 18 }, // divss
270  { ISD::FDIV, MVT::v4f32, 35 }, // divps
271  { ISD::FDIV, MVT::f64, 33 }, // divsd
272  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
273  };
274 
275  if (ST->useGLMDivSqrtCosts())
276  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
277  LT.second))
278  return LT.first * Entry->Cost;
279 
280  static const CostTblEntry SLMCostTable[] = {
281  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
282  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
283  { ISD::FMUL, MVT::f64, 2 }, // mulsd
284  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
285  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
286  { ISD::FDIV, MVT::f32, 17 }, // divss
287  { ISD::FDIV, MVT::v4f32, 39 }, // divps
288  { ISD::FDIV, MVT::f64, 32 }, // divsd
289  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
290  { ISD::FADD, MVT::v2f64, 2 }, // addpd
291  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
292  // v2i64/v4i64 mul is custom lowered as a series of long:
293  // multiplies(3), shifts(3) and adds(2)
294  // slm muldq version throughput is 2 and addq throughput 4
295  // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
296  // 3X4 (addq throughput) = 17
297  { ISD::MUL, MVT::v2i64, 17 },
298  // slm addq\subq throughput is 4
299  { ISD::ADD, MVT::v2i64, 4 },
300  { ISD::SUB, MVT::v2i64, 4 },
301  };
302 
303  if (ST->isSLM()) {
304  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
305  // Check if the operands can be shrinked into a smaller datatype.
306  // TODO: Merge this into generiic vXi32 MUL patterns above.
307  bool Op1Signed = false;
308  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
309  bool Op2Signed = false;
310  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
311 
312  bool SignedMode = Op1Signed || Op2Signed;
313  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
314 
315  if (OpMinSize <= 7)
316  return LT.first * 3; // pmullw/sext
317  if (!SignedMode && OpMinSize <= 8)
318  return LT.first * 3; // pmullw/zext
319  if (OpMinSize <= 15)
320  return LT.first * 5; // pmullw/pmulhw/pshuf
321  if (!SignedMode && OpMinSize <= 16)
322  return LT.first * 5; // pmullw/pmulhw/pshuf
323  }
324 
325  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
326  LT.second)) {
327  return LT.first * Entry->Cost;
328  }
329  }
330 
331  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
332  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
333  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
334  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
335  };
336 
338  ST->hasBWI()) {
339  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
340  LT.second))
341  return LT.first * Entry->Cost;
342  }
343 
344  static const CostTblEntry AVX512UniformConstCostTable[] = {
345  { ISD::SRA, MVT::v2i64, 1 },
346  { ISD::SRA, MVT::v4i64, 1 },
347  { ISD::SRA, MVT::v8i64, 1 },
348 
349  { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
350  { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
351  { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
352 
353  { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
354  { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
355  { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
356  { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
357  };
358 
360  ST->hasAVX512()) {
361  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
362  LT.second))
363  return LT.first * Entry->Cost;
364  }
365 
366  static const CostTblEntry AVX2UniformConstCostTable[] = {
367  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
368  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
369  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
370 
371  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
372 
373  { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
374  { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
375  { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
376  { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
377  };
378 
380  ST->hasAVX2()) {
381  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
382  LT.second))
383  return LT.first * Entry->Cost;
384  }
385 
386  static const CostTblEntry SSE2UniformConstCostTable[] = {
387  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
388  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
389  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
390 
391  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
392  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
393  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
394 
395  { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
396  { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
397  { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
398  { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
399  { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
400  { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
401  { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
402  { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
403  };
404 
405  // XOP has faster vXi8 shifts.
407  ST->hasSSE2() && !ST->hasXOP()) {
408  if (const auto *Entry =
409  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
410  return LT.first * Entry->Cost;
411  }
412 
413  static const CostTblEntry AVX512BWConstCostTable[] = {
414  { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
415  { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
416  { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
417  { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
418  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
419  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
420  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
421  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
422  };
423 
426  ST->hasBWI()) {
427  if (const auto *Entry =
428  CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
429  return LT.first * Entry->Cost;
430  }
431 
432  static const CostTblEntry AVX512ConstCostTable[] = {
433  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
434  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
435  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
436  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
437  { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
438  { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
439  { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
440  { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
441  { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
442  { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
443  { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
444  { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
445  };
446 
449  ST->hasAVX512()) {
450  if (const auto *Entry =
451  CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
452  return LT.first * Entry->Cost;
453  }
454 
455  static const CostTblEntry AVX2ConstCostTable[] = {
456  { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
457  { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
458  { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
459  { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
460  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
461  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
462  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
463  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
464  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
465  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
466  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
467  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
468  };
469 
472  ST->hasAVX2()) {
473  if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
474  return LT.first * Entry->Cost;
475  }
476 
477  static const CostTblEntry SSE2ConstCostTable[] = {
478  { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
479  { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
480  { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
481  { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
482  { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
483  { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
484  { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
485  { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
486  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
487  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
488  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
489  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
490  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
491  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
492  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
493  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
494  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
495  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
496  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
497  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
498  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
499  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
500  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
501  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
502  };
503 
506  ST->hasSSE2()) {
507  // pmuldq sequence.
508  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
509  return LT.first * 32;
510  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
511  return LT.first * 38;
512  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
513  return LT.first * 15;
514  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
515  return LT.first * 20;
516 
517  if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
518  return LT.first * Entry->Cost;
519  }
520 
521  static const CostTblEntry AVX512BWShiftCostTable[] = {
522  { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence.
523  { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence.
524  { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence.
525  { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence.
526  { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence.
527  { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence.
528  { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence.
529  { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence.
530  { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.
531 
532  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
533  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
534  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
535  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
536  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
537  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
538  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
539  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
540  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
541  };
542 
543  if (ST->hasBWI())
544  if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
545  return LT.first * Entry->Cost;
546 
547  static const CostTblEntry AVX2UniformCostTable[] = {
548  // Uniform splats are cheaper for the following instructions.
549  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
550  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
551  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
552  { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
553  { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
554  { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
555 
556  { ISD::SHL, MVT::v8i32, 1 }, // pslld
557  { ISD::SRL, MVT::v8i32, 1 }, // psrld
558  { ISD::SRA, MVT::v8i32, 1 }, // psrad
559  { ISD::SHL, MVT::v4i64, 1 }, // psllq
560  { ISD::SRL, MVT::v4i64, 1 }, // psrlq
561  };
562 
563  if (ST->hasAVX2() &&
565  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
566  if (const auto *Entry =
567  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
568  return LT.first * Entry->Cost;
569  }
570 
571  static const CostTblEntry SSE2UniformCostTable[] = {
572  // Uniform splats are cheaper for the following instructions.
573  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
574  { ISD::SHL, MVT::v4i32, 1 }, // pslld
575  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
576 
577  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
578  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
579  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
580 
581  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
582  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
583  };
584 
585  if (ST->hasSSE2() &&
587  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
588  if (const auto *Entry =
589  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
590  return LT.first * Entry->Cost;
591  }
592 
593  static const CostTblEntry AVX512DQCostTable[] = {
594  { ISD::MUL, MVT::v2i64, 2 }, // pmullq
595  { ISD::MUL, MVT::v4i64, 2 }, // pmullq
596  { ISD::MUL, MVT::v8i64, 2 } // pmullq
597  };
598 
599  // Look for AVX512DQ lowering tricks for custom cases.
600  if (ST->hasDQI())
601  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
602  return LT.first * Entry->Cost;
603 
604  static const CostTblEntry AVX512BWCostTable[] = {
605  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
606  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
607  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
608  };
609 
610  // Look for AVX512BW lowering tricks for custom cases.
611  if (ST->hasBWI())
612  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
613  return LT.first * Entry->Cost;
614 
615  static const CostTblEntry AVX512CostTable[] = {
616  { ISD::SHL, MVT::v4i32, 1 },
617  { ISD::SRL, MVT::v4i32, 1 },
618  { ISD::SRA, MVT::v4i32, 1 },
619  { ISD::SHL, MVT::v8i32, 1 },
620  { ISD::SRL, MVT::v8i32, 1 },
621  { ISD::SRA, MVT::v8i32, 1 },
622  { ISD::SHL, MVT::v16i32, 1 },
623  { ISD::SRL, MVT::v16i32, 1 },
624  { ISD::SRA, MVT::v16i32, 1 },
625 
626  { ISD::SHL, MVT::v2i64, 1 },
627  { ISD::SRL, MVT::v2i64, 1 },
628  { ISD::SHL, MVT::v4i64, 1 },
629  { ISD::SRL, MVT::v4i64, 1 },
630  { ISD::SHL, MVT::v8i64, 1 },
631  { ISD::SRL, MVT::v8i64, 1 },
632 
633  { ISD::SRA, MVT::v2i64, 1 },
634  { ISD::SRA, MVT::v4i64, 1 },
635  { ISD::SRA, MVT::v8i64, 1 },
636 
637  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
638  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
639  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
640  { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add
641 
642  { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
643  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
644  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
645  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
646  { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/
647  { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/
648  { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/
649  { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/
650 
651  { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
652  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
653  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
654  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
655  { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/
656  { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/
657  { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/
658  { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
659  };
660 
661  if (ST->hasAVX512())
662  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
663  return LT.first * Entry->Cost;
664 
665  static const CostTblEntry AVX2ShiftCostTable[] = {
666  // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
667  // customize them to detect the cases where shift amount is a scalar one.
668  { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
669  { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
670  { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
671  { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
672  { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
673  { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
674  { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
675  { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
676  { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
677  { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
678  };
679 
680  if (ST->hasAVX512()) {
681  if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
684  // On AVX512, a packed v32i16 shift left by a constant build_vector
685  // is lowered into a vector multiply (vpmullw).
686  return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
687  Op1Info, Op2Info,
690  }
691 
692  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
693  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
694  if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
697  // On AVX2, a packed v16i16 shift left by a constant build_vector
698  // is lowered into a vector multiply (vpmullw).
699  return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
700  Op1Info, Op2Info,
703 
704  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
705  return LT.first * Entry->Cost;
706  }
707 
708  static const CostTblEntry XOPShiftCostTable[] = {
709  // 128bit shifts take 1cy, but right shifts require negation beforehand.
710  { ISD::SHL, MVT::v16i8, 1 },
711  { ISD::SRL, MVT::v16i8, 2 },
712  { ISD::SRA, MVT::v16i8, 2 },
713  { ISD::SHL, MVT::v8i16, 1 },
714  { ISD::SRL, MVT::v8i16, 2 },
715  { ISD::SRA, MVT::v8i16, 2 },
716  { ISD::SHL, MVT::v4i32, 1 },
717  { ISD::SRL, MVT::v4i32, 2 },
718  { ISD::SRA, MVT::v4i32, 2 },
719  { ISD::SHL, MVT::v2i64, 1 },
720  { ISD::SRL, MVT::v2i64, 2 },
721  { ISD::SRA, MVT::v2i64, 2 },
722  // 256bit shifts require splitting if AVX2 didn't catch them above.
723  { ISD::SHL, MVT::v32i8, 2+2 },
724  { ISD::SRL, MVT::v32i8, 4+2 },
725  { ISD::SRA, MVT::v32i8, 4+2 },
726  { ISD::SHL, MVT::v16i16, 2+2 },
727  { ISD::SRL, MVT::v16i16, 4+2 },
728  { ISD::SRA, MVT::v16i16, 4+2 },
729  { ISD::SHL, MVT::v8i32, 2+2 },
730  { ISD::SRL, MVT::v8i32, 4+2 },
731  { ISD::SRA, MVT::v8i32, 4+2 },
732  { ISD::SHL, MVT::v4i64, 2+2 },
733  { ISD::SRL, MVT::v4i64, 4+2 },
734  { ISD::SRA, MVT::v4i64, 4+2 },
735  };
736 
737  // Look for XOP lowering tricks.
738  if (ST->hasXOP()) {
739  // If the right shift is constant then we'll fold the negation so
740  // it's as cheap as a left shift.
741  int ShiftISD = ISD;
742  if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
745  ShiftISD = ISD::SHL;
746  if (const auto *Entry =
747  CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
748  return LT.first * Entry->Cost;
749  }
750 
751  static const CostTblEntry SSE2UniformShiftCostTable[] = {
752  // Uniform splats are cheaper for the following instructions.
753  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
754  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
755  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
756 
757  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
758  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
759  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
760 
761  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
762  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
763  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
764  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
765  };
766 
767  if (ST->hasSSE2() &&
769  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
770 
771  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
772  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
773  return LT.first * 4; // 2*psrad + shuffle.
774 
775  if (const auto *Entry =
776  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
777  return LT.first * Entry->Cost;
778  }
779 
780  if (ISD == ISD::SHL &&
782  MVT VT = LT.second;
783  // Vector shift left by non uniform constant can be lowered
784  // into vector multiply.
785  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
786  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
787  ISD = ISD::MUL;
788  }
789 
790  static const CostTblEntry AVX2CostTable[] = {
791  { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
792  { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
793  { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
794  { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
795  { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
796  { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
797 
798  { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
799  { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
800  { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
801  { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
802  { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
803  { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
804 
805  { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
806  { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
807  { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
808  { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
809  { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
810  { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
811  { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
812  { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.
813 
814  { ISD::SUB, MVT::v32i8, 1 }, // psubb
815  { ISD::ADD, MVT::v32i8, 1 }, // paddb
816  { ISD::SUB, MVT::v16i16, 1 }, // psubw
817  { ISD::ADD, MVT::v16i16, 1 }, // paddw
818  { ISD::SUB, MVT::v8i32, 1 }, // psubd
819  { ISD::ADD, MVT::v8i32, 1 }, // paddd
820  { ISD::SUB, MVT::v4i64, 1 }, // psubq
821  { ISD::ADD, MVT::v4i64, 1 }, // paddq
822 
823  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
824  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
825  { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add
826 
827  { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
828  { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
829  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
830  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
831  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
832  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
833  { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
834  { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
835  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
836  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
837 
838  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
839  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
840  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
841  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
842  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
843  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
844  };
845 
846  // Look for AVX2 lowering tricks for custom cases.
847  if (ST->hasAVX2())
848  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
849  return LT.first * Entry->Cost;
850 
851  static const CostTblEntry AVX1CostTable[] = {
852  // We don't have to scalarize unsupported ops. We can issue two half-sized
853  // operations and we only need to extract the upper YMM half.
854  // Two ops + 1 extract + 1 insert = 4.
855  { ISD::MUL, MVT::v16i16, 4 },
856  { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
857  { ISD::MUL, MVT::v4i64, 12 },
858 
859  { ISD::SUB, MVT::v32i8, 4 },
860  { ISD::ADD, MVT::v32i8, 4 },
861  { ISD::SUB, MVT::v16i16, 4 },
862  { ISD::ADD, MVT::v16i16, 4 },
863  { ISD::SUB, MVT::v8i32, 4 },
864  { ISD::ADD, MVT::v8i32, 4 },
865  { ISD::SUB, MVT::v4i64, 4 },
866  { ISD::ADD, MVT::v4i64, 4 },
867 
868  { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
869  { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
870  { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
871  { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
872  { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
873  { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
874  { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
875 
876  { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
877  { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
878  { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
879  { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
880  { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
881  { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
882 
883  { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
884  { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
885  { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
886  { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
887  { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
888  { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.
889 
890  { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
891  { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
892 
893  { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
894  { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
895  { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
896 
897  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
898  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
899  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
900  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
901  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
902  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
903  };
904 
905  if (ST->hasAVX())
906  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
907  return LT.first * Entry->Cost;
908 
909  static const CostTblEntry SSE42CostTable[] = {
910  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
911  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
912  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
913  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
914 
915  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
916  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
917  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
918  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
919 
920  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
921  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
922  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
923  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
924 
925  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
926  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
927  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
928  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
929 
930  { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
931  };
932 
933  if (ST->hasSSE42())
934  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
935  return LT.first * Entry->Cost;
936 
937  static const CostTblEntry SSE41CostTable[] = {
938  { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
939  { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
940  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
941 
942  { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
943  { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
944  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
945 
946  { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
947  { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.
948 
949  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
950  };
951 
952  if (ST->hasSSE41())
953  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
954  return LT.first * Entry->Cost;
955 
956  static const CostTblEntry SSE2CostTable[] = {
957  // We don't correctly identify costs of casts because they are marked as
958  // custom.
959  { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence.
960  { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence.
961  { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
962  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
963 
964  { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence.
965  { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence.
966  { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend.
967  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
968 
969  { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
970  { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence.
971  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
972  { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence.
973 
974  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
975  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
976  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
977 
978  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
979  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
980  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
981  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
982 
983  { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
984  { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
985  { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
986  { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/
987 
988  { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
989  { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
990 
991  { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
992  { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
993  };
994 
995  if (ST->hasSSE2())
996  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
997  return LT.first * Entry->Cost;
998 
999  static const CostTblEntry SSE1CostTable[] = {
1000  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
1001  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
1002 
1003  { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
1004  { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1005 
1006  { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
1007  { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1008 
1009  { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
1010  { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1011  };
1012 
1013  if (ST->hasSSE1())
1014  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1015  return LT.first * Entry->Cost;
1016 
1017  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1018  { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
1019  { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
1020  };
1021 
1022  if (ST->is64Bit())
1023  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1024  return LT.first * Entry->Cost;
1025 
1026  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1027  { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
1028  { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
1029  { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
1030 
1031  { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
1032  { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
1033  { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
1034  };
1035 
1036  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1037  return LT.first * Entry->Cost;
1038 
1039  // It is not a good idea to vectorize division. We have to scalarize it and
1040  // in the process we will often end up having to spilling regular
1041  // registers. The overhead of division is going to dominate most kernels
1042  // anyways so try hard to prevent vectorization of division - it is
1043  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1044  // to hide "20 cycles" for each lane.
1045  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
1046  ISD == ISD::UDIV || ISD == ISD::UREM)) {
1048  Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
1050  return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1051  }
1052 
1053  // Fallback to the default implementation.
1054  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
1055 }
1056 
// NOTE(review): this is the body of X86TTIImpl::getShuffleCost(). The opening
// line of the signature (return type, class scope, and the TTI::ShuffleKind
// Kind parameter) appears to have been dropped by the source extraction --
// verify against the upstream file before applying.
//
// Overall flow: legalize the shuffled vector type, special-case a few shuffle
// kinds (transpose, broadcast, subvector extract/insert, split shuffles), then
// walk per-ISA cost tables from the most capable feature set downward; the
// first matching entry wins, scaled by the legalization split factor LT.first.
1058  VectorType *BaseTp,
1059  ArrayRef<int> Mask, int Index,
1060  VectorType *SubTp) {
1061  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1062  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1063  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
1064 
// NOTE(review): a statement seems to be missing from this listing here
// (upstream refines Kind from the mask, e.g. via improveShuffleKindFromMask)
// -- confirm against the original file.
1066  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1067  if (Kind == TTI::SK_Transpose)
// NOTE(review): the statement guarded by the 'if' above (upstream:
// 'Kind = TTI::SK_PermuteTwoSrc;') is missing from this listing -- confirm.
1069 
1070  // For Broadcasts we are splatting the first element from the first input
1071  // register, so only need to reference that input and all the output
1072  // registers are the same.
1073  if (Kind == TTI::SK_Broadcast)
1074  LT.first = 1;
1075 
1076  // Subvector extractions are free if they start at the beginning of a
1077  // vector and cheap if the subvectors are aligned.
1078  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1079  int NumElts = LT.second.getVectorNumElements();
1080  if ((Index % NumElts) == 0)
1081  return 0;
1082  std::pair<InstructionCost, MVT> SubLT =
1083  TLI->getTypeLegalizationCost(DL, SubTp);
1084  if (SubLT.second.isVector()) {
1085  int NumSubElts = SubLT.second.getVectorNumElements();
1086  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1087  return SubLT.first;
1088  // Handle some cases for widening legalization. For now we only handle
1089  // cases where the original subvector was naturally aligned and evenly
1090  // fit in its legalized subvector type.
1091  // FIXME: Remove some of the alignment restrictions.
1092  // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1093  // vectors.
1094  int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1095  if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1096  (NumSubElts % OrigSubElts) == 0 &&
1097  LT.second.getVectorElementType() ==
1098  SubLT.second.getVectorElementType() &&
1099  LT.second.getVectorElementType().getSizeInBits() ==
1100  BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1101  assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1102  "Unexpected number of elements!");
// Model the widened extract as: extract the aligned legalized subvector,
// then shuffle the wanted elements to the front (recursive cost query below).
1103  auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1104  LT.second.getVectorNumElements());
1105  auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1106  SubLT.second.getVectorNumElements());
1107  int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1108  InstructionCost ExtractCost = getShuffleCost(
1109  TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);
1110 
1111  // If the original size is 32-bits or more, we can use pshufd. Otherwise
1112  // if we have SSSE3 we can use pshufb.
1113  if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1114  return ExtractCost + 1; // pshufd or pshufb
1115 
1116  assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1117  "Unexpected vector size");
1118 
1119  return ExtractCost + 2; // worst case pshufhw + pshufd
1120  }
1121  }
1122  }
1123 
1124  // Subvector insertions are cheap if the subvectors are aligned.
1125  // Note that in general, the insertion starting at the beginning of a vector
1126  // isn't free, because we need to preserve the rest of the wide vector.
1127  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1128  int NumElts = LT.second.getVectorNumElements();
1129  std::pair<InstructionCost, MVT> SubLT =
1130  TLI->getTypeLegalizationCost(DL, SubTp);
1131  if (SubLT.second.isVector()) {
1132  int NumSubElts = SubLT.second.getVectorNumElements();
1133  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1134  return SubLT.first;
1135  }
1136 
1137  // If the insertion isn't aligned, treat it like a 2-op shuffle.
// NOTE(review): the statement implementing the comment above (upstream:
// 'Kind = TTI::SK_PermuteTwoSrc;') is missing from this listing -- confirm.
1139  }
1140 
1141  // Handle some common (illegal) sub-vector types as they are often very cheap
1142  // to shuffle even on targets without PSHUFB.
1143  EVT VT = TLI->getValueType(DL, BaseTp);
1144  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1145  !ST->hasSSSE3()) {
1146  static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1147  {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1148  {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1149  {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1150  {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1151  {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1152 
1153  {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1154  {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1155  {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1156  {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1157 
1158  {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1159  {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1160  {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1161  {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1162  {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1163 
1164  {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1165  {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1166  {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1167  {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1168  {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1169  };
1170 
// This lookup keys on the *original* (simple, illegal) VT rather than the
// legalized LT.second, so the sub-128-bit entries above can match.
1171  if (ST->hasSSE2())
1172  if (const auto *Entry =
1173  CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1174  return Entry->Cost;
1175  }
1176 
1177  // We are going to permute multiple sources and the result will be in multiple
1178  // destinations. Providing an accurate cost only for splits where the element
1179  // type remains the same.
1180  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1181  MVT LegalVT = LT.second;
1182  if (LegalVT.isVector() &&
1183  LegalVT.getVectorElementType().getSizeInBits() ==
1184  BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1185  LegalVT.getVectorNumElements() <
1186  cast<FixedVectorType>(BaseTp)->getNumElements()) {
1187 
1188  unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1189  unsigned LegalVTSize = LegalVT.getStoreSize();
1190  // Number of source vectors after legalization:
1191  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1192  // Number of destination vectors after legalization:
1193  InstructionCost NumOfDests = LT.first;
1194 
1195  auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1196  LegalVT.getVectorNumElements());
1197 
// Each destination may draw from any pair of the NumOfSrcs legal registers,
// so model the split as (NumOfSrcs - 1) * NumOfDests two-source shuffles.
1198  InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1199  return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1200  None, 0, nullptr);
1201  }
1202 
1203  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1204  }
1205 
1206  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1207  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1208  // We assume that source and destination have the same vector type.
1209  InstructionCost NumOfDests = LT.first;
1210  InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1211  LT.first = NumOfDests * NumOfShufflesPerDest;
1212  }
1213 
// Per-ISA cost tables follow, consulted newest/most-capable feature set first.
// Each hit returns LT.first * Entry->Cost (split factor times per-op cost).
1214  static const CostTblEntry AVX512FP16ShuffleTbl[] = {
1215  {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1216  {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1217  {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw
1218 
1219  {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1220  {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
1221  {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb
1222 
1223  {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1224  {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1225  {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb
1226 
1227  {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1228  {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
1229  {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w
1230  };
1231 
1232  if (!ST->useSoftFloat() && ST->hasFP16())
1233  if (const auto *Entry =
1234  CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
1235  return LT.first * Entry->Cost;
1236 
1237  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1238  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1239  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1240 
1241  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1242  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1243 
1244  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1245  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1246  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1247  };
1248 
1249  if (ST->hasVBMI())
1250  if (const auto *Entry =
1251  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1252  return LT.first * Entry->Cost;
1253 
1254  static const CostTblEntry AVX512BWShuffleTbl[] = {
1255  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1256  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1257 
1258  {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1259  {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1260  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1261 
1262  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1263  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1264  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1265 
1266  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1267  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1268  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1269  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1270 
1271  {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1272  {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1273  };
1274 
1275  if (ST->hasBWI())
1276  if (const auto *Entry =
1277  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1278  return LT.first * Entry->Cost;
1279 
1280  static const CostTblEntry AVX512ShuffleTbl[] = {
1281  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1282  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1283  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1284  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1285  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1286  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1287 
1288  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1289  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1290  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1291  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1292  {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
1293  {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
1294 
1295  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1296  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1297  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1298  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1299  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1300  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1301  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1302  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1303  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1304  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1305  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1306  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1307  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1308 
1309  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1310  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1311  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1312  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1313  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1314  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1315  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1316  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1317  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1318  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1319  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1320  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
1321 
1322  // FIXME: This just applies the type legalization cost rules above
1323  // assuming these completely split.
// NOTE(review): the table entries the FIXME above refers to (upstream lines
// 1324-1327, split v32i16/v64i8 permute costs) appear to have been dropped by
// the extraction -- confirm against the original file.
1328 
1329  {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
1330  {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
1331  {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
1332  {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
1333  {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
1334  {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
1335  };
1336 
1337  if (ST->hasAVX512())
1338  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1339  return LT.first * Entry->Cost;
1340 
1341  static const CostTblEntry AVX2ShuffleTbl[] = {
1342  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1343  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1344  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1345  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1346  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1347  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1348 
1349  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1350  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1351  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1352  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1353  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1354  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1355 
1356  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1357  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1358 
1359  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1360  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1361  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1362  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1363  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1364  // + vpblendvb
1365  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1366  // + vpblendvb
1367 
1368  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1369  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1370  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1371  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1372  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1373  // + vpblendvb
1374  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1375  // + vpblendvb
1376  };
1377 
1378  if (ST->hasAVX2())
1379  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1380  return LT.first * Entry->Cost;
1381 
1382  static const CostTblEntry XOPShuffleTbl[] = {
1383  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1384  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1385  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1386  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1387  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1388  // + vinsertf128
1389  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1390  // + vinsertf128
1391 
1392  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1393  // + vinsertf128
1394  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1395  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1396  // + vinsertf128
1397  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1398  };
1399 
1400  if (ST->hasXOP())
1401  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1402  return LT.first * Entry->Cost;
1403 
1404  static const CostTblEntry AVX1ShuffleTbl[] = {
1405  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1406  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1407  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1408  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1409  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1410  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1411 
1412  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1413  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1414  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1415  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1416  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1417  // + vinsertf128
1418  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1419  // + vinsertf128
1420 
1421  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1422  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1423  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1424  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1425  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1426  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1427 
1428  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1429  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1430  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1431  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1432  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1433  // + 2*por + vinsertf128
1434  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1435  // + 2*por + vinsertf128
1436 
1437  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1438  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1439  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1440  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1441  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1442  // + 4*por + vinsertf128
1443  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1444  // + 4*por + vinsertf128
1445  };
1446 
1447  if (ST->hasAVX())
1448  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1449  return LT.first * Entry->Cost;
1450 
1451  static const CostTblEntry SSE41ShuffleTbl[] = {
1452  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1453  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1454  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1455  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1456  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1457  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1458  };
1459 
1460  if (ST->hasSSE41())
1461  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1462  return LT.first * Entry->Cost;
1463 
1464  static const CostTblEntry SSSE3ShuffleTbl[] = {
1465  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1466  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1467 
1468  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1469  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1470 
1471  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1472  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1473 
1474  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1475  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1476 
1477  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1478  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1479  };
1480 
1481  if (ST->hasSSSE3())
1482  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1483  return LT.first * Entry->Cost;
1484 
1485  static const CostTblEntry SSE2ShuffleTbl[] = {
1486  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1487  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1488  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1489  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1490  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1491 
1492  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1493  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1494  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1495  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1496  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1497  // + 2*pshufd + 2*unpck + packus
1498 
1499  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1500  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1501  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1502  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1503  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1504 
1505  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1506  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1507  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1508  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1509  // + pshufd/unpck
1510  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1511  // + 2*pshufd + 2*unpck + 2*packus
1512 
1513  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1514  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1515  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1516  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1517  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1518  };
1519 
1520  if (ST->hasSSE2())
1521  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1522  return LT.first * Entry->Cost;
1523 
1524  static const CostTblEntry SSE1ShuffleTbl[] = {
1525  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1526  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1527  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1528  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1529  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1530  };
1531 
1532  if (ST->hasSSE1())
1533  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1534  return LT.first * Entry->Cost;
1535 
// No table matched: fall back to the target-independent cost model.
1536  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1537 }
1538 
1540  Type *Src,
1543  const Instruction *I) {
1544  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1545  assert(ISD && "Invalid opcode");
1546 
1547  // TODO: Allow non-throughput costs that aren't binary.
1548  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1550  return Cost == 0 ? 0 : 1;
1551  return Cost;
1552  };
1553 
1554  // The cost tables include both specific, custom (non-legal) src/dst type
1555  // conversions and generic, legalized types. We test for customs first, before
1556  // falling back to legalization.
1557  // FIXME: Need a better design of the cost table to handle non-simple types of
1558  // potential massive combinations (elem_num x src_type x dst_type).
1559  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1562 
1563  // Mask sign extend has an instruction.
1575 
1576  // Mask zero extend is a sext + shift.
1588 
1590  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1591  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
1592  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
1593  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
1594  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
1595  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
1596  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
1597  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
1598  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
1599  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
1600  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
1601  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
1602  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
1605  };
1606 
1607  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1610 
1613 
1616 
1619  };
1620 
1621  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1622  // 256-bit wide vectors.
1623 
1624  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1628 
1629  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1630  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1631  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1632  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1633  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1634  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1635  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1636  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1637  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1638  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1639  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1640  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1641  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1642  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1643  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1644  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
1645  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
1646  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
1647  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdb
1648  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
1649  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
1650  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
1651  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
1652  { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
1653  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1654  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1655 
1656  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1658 
1659  // Sign extend is zmm vpternlogd+vptruncdb.
1660  // Zero extend is zmm broadcast load+vptruncdw.
1669 
1670  // Sign extend is zmm vpternlogd+vptruncdw.
1671  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1680 
1681  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1682  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1683  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1684  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1685  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1686  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1687  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1688  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1689  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1690  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1691 
1692  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1693  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1694  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1695  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1696 
1707 
1708  { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1709  { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1710 
1719 
1730 
1742 
1749  };
1750 
1751  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1752  // Mask sign extend has an instruction.
1762 
1763  // Mask zero extend is a sext + shift.
1773 
1775  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
1776  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
1777  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
1778  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
1779  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
1780  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
1781  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
1782  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
1783  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
1784  };
1785 
1786  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1791 
1796 
1801 
1806  };
1807 
1808  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1809  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1810  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1811  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1812  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1813  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1814  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1815  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1816  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1817  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1818  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1819  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1820  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1821  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1822  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1823  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
1824  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
1825  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
1826 
1827  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1828  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1837 
1838  // sign extend is vpcmpeq+maskedmove+vpmovdw
1839  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1848 
1849  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1850  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1851  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1852  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1853  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1854  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1855  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1856  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1857  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1858  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1859 
1872 
1877 
1891 
1895 
1903  };
1904 
1905  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1912 
1927 
1929 
1940 
1943 
1948 
1957 
1965 
1976  };
1977 
1978  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1985 
1998 
2004 
2005  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2009  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2011 
2024 
2042 
2054 
2068 
2071  };
2072 
2073  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2086 
2087  // These truncates end up widening elements.
2088  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2089  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2090  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2091 
2095 
2107 
2122 
2133 
2144  };
2145 
2146  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2147  // These are somewhat magic numbers justified by comparing the
2148  // output of llvm-mca for our various supported scheduler models
2149  // and basing it off the worst case scenario.
2162 
2176 
2187 
2191  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2198 
2211 
2212  // These truncates are really widening elements.
2213  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2214  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2215  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2216  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2217  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2218  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2219 
2220  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2222  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2228  { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2229  { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2230  { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2231  };
2232 
2233  // Attempt to map directly to (simple) MVT types to let us match custom entries.
2234  EVT SrcTy = TLI->getValueType(DL, Src);
2235  EVT DstTy = TLI->getValueType(DL, Dst);
2236 
2237  // The function getSimpleVT only handles simple value types.
2238  if (SrcTy.isSimple() && DstTy.isSimple()) {
2239  MVT SimpleSrcTy = SrcTy.getSimpleVT();
2240  MVT SimpleDstTy = DstTy.getSimpleVT();
2241 
2242  if (ST->useAVX512Regs()) {
2243  if (ST->hasBWI())
2244  if (const auto *Entry = ConvertCostTableLookup(
2245  AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2246  return AdjustCost(Entry->Cost);
2247 
2248  if (ST->hasDQI())
2249  if (const auto *Entry = ConvertCostTableLookup(
2250  AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2251  return AdjustCost(Entry->Cost);
2252 
2253  if (ST->hasAVX512())
2254  if (const auto *Entry = ConvertCostTableLookup(
2255  AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2256  return AdjustCost(Entry->Cost);
2257  }
2258 
2259  if (ST->hasBWI())
2260  if (const auto *Entry = ConvertCostTableLookup(
2261  AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2262  return AdjustCost(Entry->Cost);
2263 
2264  if (ST->hasDQI())
2265  if (const auto *Entry = ConvertCostTableLookup(
2266  AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2267  return AdjustCost(Entry->Cost);
2268 
2269  if (ST->hasAVX512())
2270  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2271  SimpleDstTy, SimpleSrcTy))
2272  return AdjustCost(Entry->Cost);
2273 
2274  if (ST->hasAVX2()) {
2275  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2276  SimpleDstTy, SimpleSrcTy))
2277  return AdjustCost(Entry->Cost);
2278  }
2279 
2280  if (ST->hasAVX()) {
2281  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2282  SimpleDstTy, SimpleSrcTy))
2283  return AdjustCost(Entry->Cost);
2284  }
2285 
2286  if (ST->hasSSE41()) {
2287  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2288  SimpleDstTy, SimpleSrcTy))
2289  return AdjustCost(Entry->Cost);
2290  }
2291 
2292  if (ST->hasSSE2()) {
2293  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2294  SimpleDstTy, SimpleSrcTy))
2295  return AdjustCost(Entry->Cost);
2296  }
2297  }
2298 
2299  // Fall back to legalized types.
2300  std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2301  std::pair<InstructionCost, MVT> LTDest =
2302  TLI->getTypeLegalizationCost(DL, Dst);
2303 
2304  if (ST->useAVX512Regs()) {
2305  if (ST->hasBWI())
2306  if (const auto *Entry = ConvertCostTableLookup(
2307  AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2308  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2309 
2310  if (ST->hasDQI())
2311  if (const auto *Entry = ConvertCostTableLookup(
2312  AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2313  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2314 
2315  if (ST->hasAVX512())
2316  if (const auto *Entry = ConvertCostTableLookup(
2317  AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2318  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2319  }
2320 
2321  if (ST->hasBWI())
2322  if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2323  LTDest.second, LTSrc.second))
2324  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2325 
2326  if (ST->hasDQI())
2327  if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2328  LTDest.second, LTSrc.second))
2329  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2330 
2331  if (ST->hasAVX512())
2332  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2333  LTDest.second, LTSrc.second))
2334  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2335 
2336  if (ST->hasAVX2())
2337  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2338  LTDest.second, LTSrc.second))
2339  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2340 
2341  if (ST->hasAVX())
2342  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2343  LTDest.second, LTSrc.second))
2344  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2345 
2346  if (ST->hasSSE41())
2347  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2348  LTDest.second, LTSrc.second))
2349  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2350 
2351  if (ST->hasSSE2())
2352  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2353  LTDest.second, LTSrc.second))
2354  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2355 
2356  // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
2357  // sitofp.
2358  if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
2359  1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
2360  Type *ExtSrc = Src->getWithNewBitWidth(32);
2361  unsigned ExtOpc =
2362  (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
2363 
2364  // For scalar loads the extend would be free.
2365  InstructionCost ExtCost = 0;
2366  if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
2367  ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
2368 
2369  return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
2371  }
2372 
2373  // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
2374  // i32.
2375  if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
2376  1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
2377  Type *TruncDst = Dst->getWithNewBitWidth(32);
2378  return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
2379  getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
2381  }
2382 
2383  return AdjustCost(
2384  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2385 }
2386 
2388  Type *CondTy,
2389  CmpInst::Predicate VecPred,
2391  const Instruction *I) {
2392  // TODO: Handle other cost kinds.
2394  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2395  I);
2396 
2397  // Legalize the type.
2398  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2399 
2400  MVT MTy = LT.second;
2401 
2402  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2403  assert(ISD && "Invalid opcode");
2404 
2405  unsigned ExtraCost = 0;
2406  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
2407  // Some vector comparison predicates cost extra instructions.
2408  if (MTy.isVector() &&
2409  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2410  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2411  ST->hasBWI())) {
2412  switch (cast<CmpInst>(I)->getPredicate()) {
2413  case CmpInst::Predicate::ICMP_NE:
2414  // xor(cmpeq(x,y),-1)
2415  ExtraCost = 1;
2416  break;
2417  case CmpInst::Predicate::ICMP_SGE:
2418  case CmpInst::Predicate::ICMP_SLE:
2419  // xor(cmpgt(x,y),-1)
2420  ExtraCost = 1;
2421  break;
2422  case CmpInst::Predicate::ICMP_ULT:
2423  case CmpInst::Predicate::ICMP_UGT:
2424  // cmpgt(xor(x,signbit),xor(y,signbit))
2425  // xor(cmpeq(pmaxu(x,y),x),-1)
2426  ExtraCost = 2;
2427  break;
2428  case CmpInst::Predicate::ICMP_ULE:
2429  case CmpInst::Predicate::ICMP_UGE:
2430  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2431  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2432  // cmpeq(psubus(x,y),0)
2433  // cmpeq(pminu(x,y),x)
2434  ExtraCost = 1;
2435  } else {
2436  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2437  ExtraCost = 3;
2438  }
2439  break;
2440  default:
2441  break;
2442  }
2443  }
2444  }
2445 
2446  static const CostTblEntry SLMCostTbl[] = {
2447  // slm pcmpeq/pcmpgt throughput is 2
2448  { ISD::SETCC, MVT::v2i64, 2 },
2449  };
2450 
2451  static const CostTblEntry AVX512BWCostTbl[] = {
2452  { ISD::SETCC, MVT::v32i16, 1 },
2453  { ISD::SETCC, MVT::v64i8, 1 },
2454 
2455  { ISD::SELECT, MVT::v32i16, 1 },
2456  { ISD::SELECT, MVT::v64i8, 1 },
2457  };
2458 
2459  static const CostTblEntry AVX512CostTbl[] = {
2460  { ISD::SETCC, MVT::v8i64, 1 },
2461  { ISD::SETCC, MVT::v16i32, 1 },
2462  { ISD::SETCC, MVT::v8f64, 1 },
2463  { ISD::SETCC, MVT::v16f32, 1 },
2464 
2465  { ISD::SELECT, MVT::v8i64, 1 },
2466  { ISD::SELECT, MVT::v16i32, 1 },
2467  { ISD::SELECT, MVT::v8f64, 1 },
2468  { ISD::SELECT, MVT::v16f32, 1 },
2469 
2470  { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2471  { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2472 
2473  { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2474  { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2475  };
2476 
2477  static const CostTblEntry AVX2CostTbl[] = {
2478  { ISD::SETCC, MVT::v4i64, 1 },
2479  { ISD::SETCC, MVT::v8i32, 1 },
2480  { ISD::SETCC, MVT::v16i16, 1 },
2481  { ISD::SETCC, MVT::v32i8, 1 },
2482 
2483  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2484  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2485  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2486  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2487  };
2488 
2489  static const CostTblEntry AVX1CostTbl[] = {
2490  { ISD::SETCC, MVT::v4f64, 1 },
2491  { ISD::SETCC, MVT::v8f32, 1 },
2492  // AVX1 does not support 8-wide integer compare.
2493  { ISD::SETCC, MVT::v4i64, 4 },
2494  { ISD::SETCC, MVT::v8i32, 4 },
2495  { ISD::SETCC, MVT::v16i16, 4 },
2496  { ISD::SETCC, MVT::v32i8, 4 },
2497 
2498  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2499  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2500  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2501  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2502  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2503  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2504  };
2505 
2506  static const CostTblEntry SSE42CostTbl[] = {
2507  { ISD::SETCC, MVT::v2f64, 1 },
2508  { ISD::SETCC, MVT::v4f32, 1 },
2509  { ISD::SETCC, MVT::v2i64, 1 },
2510  };
2511 
2512  static const CostTblEntry SSE41CostTbl[] = {
2513  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2514  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2515  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2516  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2517  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2518  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2519  };
2520 
2521  static const CostTblEntry SSE2CostTbl[] = {
2522  { ISD::SETCC, MVT::v2f64, 2 },
2523  { ISD::SETCC, MVT::f64, 1 },
2524  { ISD::SETCC, MVT::v2i64, 8 },
2525  { ISD::SETCC, MVT::v4i32, 1 },
2526  { ISD::SETCC, MVT::v8i16, 1 },
2527  { ISD::SETCC, MVT::v16i8, 1 },
2528 
2529  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2530  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2531  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2532  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2533  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2534  };
2535 
2536  static const CostTblEntry SSE1CostTbl[] = {
2537  { ISD::SETCC, MVT::v4f32, 2 },
2538  { ISD::SETCC, MVT::f32, 1 },
2539 
2540  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2541  };
2542 
2543  if (ST->isSLM())
2544  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2545  return LT.first * (ExtraCost + Entry->Cost);
2546 
2547  if (ST->hasBWI())
2548  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2549  return LT.first * (ExtraCost + Entry->Cost);
2550 
2551  if (ST->hasAVX512())
2552  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2553  return LT.first * (ExtraCost + Entry->Cost);
2554 
2555  if (ST->hasAVX2())
2556  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2557  return LT.first * (ExtraCost + Entry->Cost);
2558 
2559  if (ST->hasAVX())
2560  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2561  return LT.first * (ExtraCost + Entry->Cost);
2562 
2563  if (ST->hasSSE42())
2564  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2565  return LT.first * (ExtraCost + Entry->Cost);
2566 
2567  if (ST->hasSSE41())
2568  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2569  return LT.first * (ExtraCost + Entry->Cost);
2570 
2571  if (ST->hasSSE2())
2572  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2573  return LT.first * (ExtraCost + Entry->Cost);
2574 
2575  if (ST->hasSSE1())
2576  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2577  return LT.first * (ExtraCost + Entry->Cost);
2578 
2579  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2580 }
2581 
2583 
2587 
2588  // Costs should match the codegen from:
2589  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2590  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2591  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2592  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2593  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2594 
2595  // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2596  // specialized in these tables yet.
2597  static const CostTblEntry AVX512BITALGCostTbl[] = {
2598  { ISD::CTPOP, MVT::v32i16, 1 },
2599  { ISD::CTPOP, MVT::v64i8, 1 },
2600  { ISD::CTPOP, MVT::v16i16, 1 },
2601  { ISD::CTPOP, MVT::v32i8, 1 },
2602  { ISD::CTPOP, MVT::v8i16, 1 },
2603  { ISD::CTPOP, MVT::v16i8, 1 },
2604  };
2605  static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = {
2606  { ISD::CTPOP, MVT::v8i64, 1 },
2607  { ISD::CTPOP, MVT::v16i32, 1 },
2608  { ISD::CTPOP, MVT::v4i64, 1 },
2609  { ISD::CTPOP, MVT::v8i32, 1 },
2610  { ISD::CTPOP, MVT::v2i64, 1 },
2611  { ISD::CTPOP, MVT::v4i32, 1 },
2612  };
2613  static const CostTblEntry AVX512CDCostTbl[] = {
2614  { ISD::CTLZ, MVT::v8i64, 1 },
2615  { ISD::CTLZ, MVT::v16i32, 1 },
2616  { ISD::CTLZ, MVT::v32i16, 8 },
2617  { ISD::CTLZ, MVT::v64i8, 20 },
2618  { ISD::CTLZ, MVT::v4i64, 1 },
2619  { ISD::CTLZ, MVT::v8i32, 1 },
2620  { ISD::CTLZ, MVT::v16i16, 4 },
2621  { ISD::CTLZ, MVT::v32i8, 10 },
2622  { ISD::CTLZ, MVT::v2i64, 1 },
2623  { ISD::CTLZ, MVT::v4i32, 1 },
2624  { ISD::CTLZ, MVT::v8i16, 4 },
2625  { ISD::CTLZ, MVT::v16i8, 4 },
2626  };
2627  static const CostTblEntry AVX512BWCostTbl[] = {
2628  { ISD::ABS, MVT::v32i16, 1 },
2629  { ISD::ABS, MVT::v64i8, 1 },
2630  { ISD::BITREVERSE, MVT::v8i64, 3 },
2631  { ISD::BITREVERSE, MVT::v16i32, 3 },
2632  { ISD::BITREVERSE, MVT::v32i16, 3 },
2633  { ISD::BITREVERSE, MVT::v64i8, 2 },
2634  { ISD::BSWAP, MVT::v8i64, 1 },
2635  { ISD::BSWAP, MVT::v16i32, 1 },
2636  { ISD::BSWAP, MVT::v32i16, 1 },
2637  { ISD::CTLZ, MVT::v8i64, 23 },
2638  { ISD::CTLZ, MVT::v16i32, 22 },
2639  { ISD::CTLZ, MVT::v32i16, 18 },
2640  { ISD::CTLZ, MVT::v64i8, 17 },
2641  { ISD::CTPOP, MVT::v8i64, 7 },
2642  { ISD::CTPOP, MVT::v16i32, 11 },
2643  { ISD::CTPOP, MVT::v32i16, 9 },
2644  { ISD::CTPOP, MVT::v64i8, 6 },
2645  { ISD::CTTZ, MVT::v8i64, 10 },
2646  { ISD::CTTZ, MVT::v16i32, 14 },
2647  { ISD::CTTZ, MVT::v32i16, 12 },
2648  { ISD::CTTZ, MVT::v64i8, 9 },
2649  { ISD::SADDSAT, MVT::v32i16, 1 },
2650  { ISD::SADDSAT, MVT::v64i8, 1 },
2651  { ISD::SMAX, MVT::v32i16, 1 },
2652  { ISD::SMAX, MVT::v64i8, 1 },
2653  { ISD::SMIN, MVT::v32i16, 1 },
2654  { ISD::SMIN, MVT::v64i8, 1 },
2655  { ISD::SSUBSAT, MVT::v32i16, 1 },
2656  { ISD::SSUBSAT, MVT::v64i8, 1 },
2657  { ISD::UADDSAT, MVT::v32i16, 1 },
2658  { ISD::UADDSAT, MVT::v64i8, 1 },
2659  { ISD::UMAX, MVT::v32i16, 1 },
2660  { ISD::UMAX, MVT::v64i8, 1 },
2661  { ISD::UMIN, MVT::v32i16, 1 },
2662  { ISD::UMIN, MVT::v64i8, 1 },
2663  { ISD::USUBSAT, MVT::v32i16, 1 },
2664  { ISD::USUBSAT, MVT::v64i8, 1 },
2665  };
2666  static const CostTblEntry AVX512CostTbl[] = {
2667  { ISD::ABS, MVT::v8i64, 1 },
2668  { ISD::ABS, MVT::v16i32, 1 },
2669  { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
2670  { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
2671  { ISD::ABS, MVT::v4i64, 1 },
2672  { ISD::ABS, MVT::v2i64, 1 },
2673  { ISD::BITREVERSE, MVT::v8i64, 36 },
2674  { ISD::BITREVERSE, MVT::v16i32, 24 },
2675  { ISD::BITREVERSE, MVT::v32i16, 10 },
2676  { ISD::BITREVERSE, MVT::v64i8, 10 },
2677  { ISD::BSWAP, MVT::v8i64, 4 },
2678  { ISD::BSWAP, MVT::v16i32, 4 },
2679  { ISD::BSWAP, MVT::v32i16, 4 },
2680  { ISD::CTLZ, MVT::v8i64, 29 },
2681  { ISD::CTLZ, MVT::v16i32, 35 },
2682  { ISD::CTLZ, MVT::v32i16, 28 },
2683  { ISD::CTLZ, MVT::v64i8, 18 },
2684  { ISD::CTPOP, MVT::v8i64, 16 },
2685  { ISD::CTPOP, MVT::v16i32, 24 },
2686  { ISD::CTPOP, MVT::v32i16, 18 },
2687  { ISD::CTPOP, MVT::v64i8, 12 },
2688  { ISD::CTTZ, MVT::v8i64, 20 },
2689  { ISD::CTTZ, MVT::v16i32, 28 },
2690  { ISD::CTTZ, MVT::v32i16, 24 },
2691  { ISD::CTTZ, MVT::v64i8, 18 },
2692  { ISD::SMAX, MVT::v8i64, 1 },
2693  { ISD::SMAX, MVT::v16i32, 1 },
2694  { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
2695  { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
2696  { ISD::SMAX, MVT::v4i64, 1 },
2697  { ISD::SMAX, MVT::v2i64, 1 },
2698  { ISD::SMIN, MVT::v8i64, 1 },
2699  { ISD::SMIN, MVT::v16i32, 1 },
2700  { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
2701  { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
2702  { ISD::SMIN, MVT::v4i64, 1 },
2703  { ISD::SMIN, MVT::v2i64, 1 },
2704  { ISD::UMAX, MVT::v8i64, 1 },
2705  { ISD::UMAX, MVT::v16i32, 1 },
2706  { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
2707  { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
2708  { ISD::UMAX, MVT::v4i64, 1 },
2709  { ISD::UMAX, MVT::v2i64, 1 },
2710  { ISD::UMIN, MVT::v8i64, 1 },
2711  { ISD::UMIN, MVT::v16i32, 1 },
2712  { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
2713  { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
2714  { ISD::UMIN, MVT::v4i64, 1 },
2715  { ISD::UMIN, MVT::v2i64, 1 },
2716  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2717  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2718  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2719  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2720  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2721  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2722  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2723  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2724  { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2725  { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2726  { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2727  { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2728  { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2729  { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2730  { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2731  { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2732  { ISD::FMAXNUM, MVT::f32, 2 },
2733  { ISD::FMAXNUM, MVT::v4f32, 2 },
2734  { ISD::FMAXNUM, MVT::v8f32, 2 },
2735  { ISD::FMAXNUM, MVT::v16f32, 2 },
2736  { ISD::FMAXNUM, MVT::f64, 2 },
2737  { ISD::FMAXNUM, MVT::v2f64, 2 },
2738  { ISD::FMAXNUM, MVT::v4f64, 2 },
2739  { ISD::FMAXNUM, MVT::v8f64, 2 },
2740  };
2741  static const CostTblEntry XOPCostTbl[] = {
2742  { ISD::BITREVERSE, MVT::v4i64, 4 },
2743  { ISD::BITREVERSE, MVT::v8i32, 4 },
2744  { ISD::BITREVERSE, MVT::v16i16, 4 },
2745  { ISD::BITREVERSE, MVT::v32i8, 4 },
2746  { ISD::BITREVERSE, MVT::v2i64, 1 },
2747  { ISD::BITREVERSE, MVT::v4i32, 1 },
2748  { ISD::BITREVERSE, MVT::v8i16, 1 },
2749  { ISD::BITREVERSE, MVT::v16i8, 1 },
2750  { ISD::BITREVERSE, MVT::i64, 3 },
2751  { ISD::BITREVERSE, MVT::i32, 3 },
2752  { ISD::BITREVERSE, MVT::i16, 3 },
2753  { ISD::BITREVERSE, MVT::i8, 3 }
2754  };
// AVX2 costs (Haswell baseline — see the file header comment). With native
// 256-bit integer ops, most 256-bit entries cost the same as their 128-bit
// counterparts instead of being split.
static const CostTblEntry AVX2CostTbl[] = {
  { ISD::ABS,        MVT::v4i64,   2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
  { ISD::ABS,        MVT::v8i32,   1 },
  { ISD::ABS,        MVT::v16i16,  1 },
  { ISD::ABS,        MVT::v32i8,   1 },
  { ISD::BITREVERSE, MVT::v2i64,   3 },
  { ISD::BITREVERSE, MVT::v4i64,   3 },
  { ISD::BITREVERSE, MVT::v4i32,   3 },
  { ISD::BITREVERSE, MVT::v8i32,   3 },
  { ISD::BITREVERSE, MVT::v8i16,   3 },
  { ISD::BITREVERSE, MVT::v16i16,  3 },
  { ISD::BITREVERSE, MVT::v16i8,   3 },
  { ISD::BITREVERSE, MVT::v32i8,   3 },
  { ISD::BSWAP,      MVT::v4i64,   1 },
  { ISD::BSWAP,      MVT::v8i32,   1 },
  { ISD::BSWAP,      MVT::v16i16,  1 },
  { ISD::CTLZ,       MVT::v2i64,   7 },
  { ISD::CTLZ,       MVT::v4i64,   7 },
  { ISD::CTLZ,       MVT::v4i32,   5 },
  { ISD::CTLZ,       MVT::v8i32,   5 },
  { ISD::CTLZ,       MVT::v8i16,   4 },
  { ISD::CTLZ,       MVT::v16i16,  4 },
  { ISD::CTLZ,       MVT::v16i8,   3 },
  { ISD::CTLZ,       MVT::v32i8,   3 },
  { ISD::CTPOP,      MVT::v2i64,   3 },
  { ISD::CTPOP,      MVT::v4i64,   3 },
  { ISD::CTPOP,      MVT::v4i32,   7 },
  { ISD::CTPOP,      MVT::v8i32,   7 },
  { ISD::CTPOP,      MVT::v8i16,   3 },
  { ISD::CTPOP,      MVT::v16i16,  3 },
  { ISD::CTPOP,      MVT::v16i8,   2 },
  { ISD::CTPOP,      MVT::v32i8,   2 },
  { ISD::CTTZ,       MVT::v2i64,   4 },
  { ISD::CTTZ,       MVT::v4i64,   4 },
  { ISD::CTTZ,       MVT::v4i32,   7 },
  { ISD::CTTZ,       MVT::v8i32,   7 },
  { ISD::CTTZ,       MVT::v8i16,   4 },
  { ISD::CTTZ,       MVT::v16i16,  4 },
  { ISD::CTTZ,       MVT::v16i8,   3 },
  { ISD::CTTZ,       MVT::v32i8,   3 },
  { ISD::SADDSAT,    MVT::v16i16,  1 },
  { ISD::SADDSAT,    MVT::v32i8,   1 },
  { ISD::SMAX,       MVT::v8i32,   1 },
  { ISD::SMAX,       MVT::v16i16,  1 },
  { ISD::SMAX,       MVT::v32i8,   1 },
  { ISD::SMIN,       MVT::v8i32,   1 },
  { ISD::SMIN,       MVT::v16i16,  1 },
  { ISD::SMIN,       MVT::v32i8,   1 },
  { ISD::SSUBSAT,    MVT::v16i16,  1 },
  { ISD::SSUBSAT,    MVT::v32i8,   1 },
  { ISD::UADDSAT,    MVT::v16i16,  1 },
  { ISD::UADDSAT,    MVT::v32i8,   1 },
  { ISD::UADDSAT,    MVT::v8i32,   3 }, // not + pminud + paddd
  { ISD::UMAX,       MVT::v8i32,   1 },
  { ISD::UMAX,       MVT::v16i16,  1 },
  { ISD::UMAX,       MVT::v32i8,   1 },
  { ISD::UMIN,       MVT::v8i32,   1 },
  { ISD::UMIN,       MVT::v16i16,  1 },
  { ISD::UMIN,       MVT::v32i8,   1 },
  { ISD::USUBSAT,    MVT::v16i16,  1 },
  { ISD::USUBSAT,    MVT::v32i8,   1 },
  { ISD::USUBSAT,    MVT::v8i32,   2 }, // pmaxud + psubd
  { ISD::FMAXNUM,    MVT::v8f32,   3 }, // MAXPS + CMPUNORDPS + BLENDVPS
  { ISD::FMAXNUM,    MVT::v4f64,   3 }, // MAXPD + CMPUNORDPD + BLENDVPD
  { ISD::FSQRT,      MVT::f32,     7 }, // Haswell from http://www.agner.org/
  { ISD::FSQRT,      MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
  { ISD::FSQRT,      MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
  { ISD::FSQRT,      MVT::f64,    14 }, // Haswell from http://www.agner.org/
  { ISD::FSQRT,      MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
  { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
};
// AVX1 costs (Sandy Bridge baseline). AVX1 has no 256-bit integer ops, so
// 256-bit integer types are handled as two 128-bit ops plus a cross-lane
// extract/insert — roughly 2x the SSE cost plus the shuffle overhead.
static const CostTblEntry AVX1CostTbl[] = {
  { ISD::ABS,        MVT::v4i64,   5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
  { ISD::ABS,        MVT::v8i32,   3 },
  { ISD::ABS,        MVT::v16i16,  3 },
  { ISD::ABS,        MVT::v32i8,   3 },
  { ISD::BITREVERSE, MVT::v4i64,  12 }, // 2 x 128-bit Op + extract/insert
  { ISD::BITREVERSE, MVT::v8i32,  12 }, // 2 x 128-bit Op + extract/insert
  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
  { ISD::BITREVERSE, MVT::v32i8,  12 }, // 2 x 128-bit Op + extract/insert
  { ISD::BSWAP,      MVT::v4i64,   4 },
  { ISD::BSWAP,      MVT::v8i32,   4 },
  { ISD::BSWAP,      MVT::v16i16,  4 },
  { ISD::CTLZ,       MVT::v4i64,  48 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTLZ,       MVT::v8i32,  38 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTLZ,       MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTLZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTPOP,      MVT::v4i64,  16 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTPOP,      MVT::v8i32,  24 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTPOP,      MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTPOP,      MVT::v32i8,  14 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTTZ,       MVT::v4i64,  22 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTTZ,       MVT::v8i32,  30 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTTZ,       MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
  { ISD::CTTZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
  { ISD::SADDSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
  { ISD::SADDSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::SMAX,       MVT::v8i32,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::SMAX,       MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
  { ISD::SMAX,       MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::SMIN,       MVT::v8i32,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::SMIN,       MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
  { ISD::SMIN,       MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::SSUBSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
  { ISD::SSUBSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::UADDSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
  { ISD::UADDSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::UADDSAT,    MVT::v8i32,   8 }, // 2 x 128-bit Op + extract/insert
  { ISD::UMAX,       MVT::v8i32,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::UMAX,       MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
  { ISD::UMAX,       MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::UMIN,       MVT::v8i32,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::UMIN,       MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
  { ISD::UMIN,       MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::USUBSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
  { ISD::USUBSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
  { ISD::USUBSAT,    MVT::v8i32,   6 }, // 2 x 128-bit Op + extract/insert
  { ISD::FMAXNUM,    MVT::f32,     3 }, // MAXSS + CMPUNORDSS + BLENDVPS
  { ISD::FMAXNUM,    MVT::v4f32,   3 }, // MAXPS + CMPUNORDPS + BLENDVPS
  { ISD::FMAXNUM,    MVT::v8f32,   5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
  { ISD::FMAXNUM,    MVT::f64,     3 }, // MAXSD + CMPUNORDSD + BLENDVPD
  { ISD::FMAXNUM,    MVT::v2f64,   3 }, // MAXPD + CMPUNORDPD + BLENDVPD
  { ISD::FMAXNUM,    MVT::v4f64,   5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
  { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
  { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
  { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
  { ISD::FSQRT,      MVT::f64,    21 }, // SNB from http://www.agner.org/
  { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
  { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
};
// Goldmont sqrt costs (selected via ST->useGLMDivSqrtCosts() below); sqrt is
// much slower on these low-power cores than the generic tables assume.
static const CostTblEntry GLMCostTbl[] = {
  { ISD::FSQRT, MVT::f32,   19 }, // sqrtss
  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
  { ISD::FSQRT, MVT::f64,   34 }, // sqrtsd
  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
};
// Silvermont sqrt costs (selected via ST->isSLM() below); like Goldmont,
// sqrt is notably expensive on this microarchitecture.
static const CostTblEntry SLMCostTbl[] = {
  { ISD::FSQRT, MVT::f32,   20 }, // sqrtss
  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
  { ISD::FSQRT, MVT::f64,   35 }, // sqrtsd
  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
};
// SSE4.2 costs (Nehalem baseline). PMINUD/PMAXUD (SSE4.1) give short
// expansions for unsigned i32 saturating add/sub.
static const CostTblEntry SSE42CostTbl[] = {
  { ISD::USUBSAT, MVT::v4i32,  2 }, // pmaxud + psubd
  { ISD::UADDSAT, MVT::v4i32,  3 }, // not + pminud + paddd
  { ISD::FSQRT,   MVT::f32,   18 }, // Nehalem from http://www.agner.org/
  { ISD::FSQRT,   MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
// SSE4.1 costs (Penryn baseline). SSE4.1 fills in the min/max element-type
// gaps that SSE2 left (signed i32/i8 and unsigned i32/i16 variants).
static const CostTblEntry SSE41CostTbl[] = {
  { ISD::ABS,  MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
  { ISD::SMAX, MVT::v4i32, 1 },
  { ISD::SMAX, MVT::v16i8, 1 },
  { ISD::SMIN, MVT::v4i32, 1 },
  { ISD::SMIN, MVT::v16i8, 1 },
  { ISD::UMAX, MVT::v4i32, 1 },
  { ISD::UMAX, MVT::v8i16, 1 },
  { ISD::UMIN, MVT::v4i32, 1 },
  { ISD::UMIN, MVT::v8i16, 1 },
};
// SSSE3 costs. PABS* and PSHUFB make abs/bswap single ops, and
// table-lookup (presumably PSHUFB-based) expansions sharply reduce
// bitreverse/ctlz/ctpop/cttz costs relative to the SSE2 fallbacks.
static const CostTblEntry SSSE3CostTbl[] = {
  { ISD::ABS,        MVT::v4i32,  1 },
  { ISD::ABS,        MVT::v8i16,  1 },
  { ISD::ABS,        MVT::v16i8,  1 },
  { ISD::BITREVERSE, MVT::v2i64,  5 },
  { ISD::BITREVERSE, MVT::v4i32,  5 },
  { ISD::BITREVERSE, MVT::v8i16,  5 },
  { ISD::BITREVERSE, MVT::v16i8,  5 },
  { ISD::BSWAP,      MVT::v2i64,  1 },
  { ISD::BSWAP,      MVT::v4i32,  1 },
  { ISD::BSWAP,      MVT::v8i16,  1 },
  { ISD::CTLZ,       MVT::v2i64, 23 },
  { ISD::CTLZ,       MVT::v4i32, 18 },
  { ISD::CTLZ,       MVT::v8i16, 14 },
  { ISD::CTLZ,       MVT::v16i8,  9 },
  { ISD::CTPOP,      MVT::v2i64,  7 },
  { ISD::CTPOP,      MVT::v4i32, 11 },
  { ISD::CTPOP,      MVT::v8i16,  9 },
  { ISD::CTPOP,      MVT::v16i8,  6 },
  { ISD::CTTZ,       MVT::v2i64, 10 },
  { ISD::CTTZ,       MVT::v4i32, 14 },
  { ISD::CTTZ,       MVT::v8i16, 12 },
  { ISD::CTTZ,       MVT::v16i8,  9 }
};
// SSE2 baseline costs. Without PSHUFB the bit-manipulation ops expand to
// long shift/mask sequences, hence the large BITREVERSE/CTLZ/CTPOP/CTTZ
// numbers. Note SSE2 only has signed i16 and unsigned i8 min/max natively;
// the other combinations cost an extra op.
static const CostTblEntry SSE2CostTbl[] = {
  { ISD::ABS,        MVT::v2i64,  4 },
  { ISD::ABS,        MVT::v4i32,  3 },
  { ISD::ABS,        MVT::v8i16,  2 },
  { ISD::ABS,        MVT::v16i8,  2 },
  { ISD::BITREVERSE, MVT::v2i64, 29 },
  { ISD::BITREVERSE, MVT::v4i32, 27 },
  { ISD::BITREVERSE, MVT::v8i16, 27 },
  { ISD::BITREVERSE, MVT::v16i8, 20 },
  { ISD::BSWAP,      MVT::v2i64,  7 },
  { ISD::BSWAP,      MVT::v4i32,  7 },
  { ISD::BSWAP,      MVT::v8i16,  7 },
  { ISD::CTLZ,       MVT::v2i64, 25 },
  { ISD::CTLZ,       MVT::v4i32, 26 },
  { ISD::CTLZ,       MVT::v8i16, 20 },
  { ISD::CTLZ,       MVT::v16i8, 17 },
  { ISD::CTPOP,      MVT::v2i64, 12 },
  { ISD::CTPOP,      MVT::v4i32, 15 },
  { ISD::CTPOP,      MVT::v8i16, 13 },
  { ISD::CTPOP,      MVT::v16i8, 10 },
  { ISD::CTTZ,       MVT::v2i64, 14 },
  { ISD::CTTZ,       MVT::v4i32, 18 },
  { ISD::CTTZ,       MVT::v8i16, 16 },
  { ISD::CTTZ,       MVT::v16i8, 13 },
  { ISD::SADDSAT,    MVT::v8i16,  1 },
  { ISD::SADDSAT,    MVT::v16i8,  1 },
  { ISD::SMAX,       MVT::v8i16,  1 },
  { ISD::SMIN,       MVT::v8i16,  1 },
  { ISD::SSUBSAT,    MVT::v8i16,  1 },
  { ISD::SSUBSAT,    MVT::v16i8,  1 },
  { ISD::UADDSAT,    MVT::v8i16,  1 },
  { ISD::UADDSAT,    MVT::v16i8,  1 },
  { ISD::UMAX,       MVT::v8i16,  2 },
  { ISD::UMAX,       MVT::v16i8,  1 },
  { ISD::UMIN,       MVT::v8i16,  2 },
  { ISD::UMIN,       MVT::v16i8,  1 },
  { ISD::USUBSAT,    MVT::v8i16,  1 },
  { ISD::USUBSAT,    MVT::v16i8,  1 },
  { ISD::FMAXNUM,    MVT::f64,    4 },
  { ISD::FMAXNUM,    MVT::v2f64,  4 },
  { ISD::FSQRT,      MVT::f64,   32 }, // Nehalem from http://www.agner.org/
  { ISD::FSQRT,      MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
};
// SSE1 costs (f32 only — SSE1 has no f64 support).
static const CostTblEntry SSE1CostTbl[] = {
  { ISD::FMAXNUM, MVT::f32,    4 },
  { ISD::FMAXNUM, MVT::v4f32,  4 },
  { ISD::FSQRT,   MVT::f32,   28 }, // Pentium III from http://www.agner.org/
  { ISD::FSQRT,   MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
// Scalar single-instruction tables: BMI gives TZCNT, LZCNT gives LZCNT, and
// POPCNT gives POPCNT — each cost 1 when the feature is present. The *64
// tables are consulted only on 64-bit targets (see the lookup cascade below);
// the *32 tables cover the i32/i16/i8 widths on both 32- and 64-bit targets.
static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
  { ISD::CTTZ, MVT::i64, 1 },
};
static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
  { ISD::CTTZ, MVT::i32, 1 },
  { ISD::CTTZ, MVT::i16, 1 },
  { ISD::CTTZ, MVT::i8,  1 },
};
static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
  { ISD::CTLZ, MVT::i64, 1 },
};
static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
  { ISD::CTLZ, MVT::i32, 1 },
  { ISD::CTLZ, MVT::i16, 1 },
  { ISD::CTLZ, MVT::i8,  1 },
};
static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
  { ISD::CTPOP, MVT::i64, 1 },
};
static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
  { ISD::CTPOP, MVT::i32, 1 },
  { ISD::CTPOP, MVT::i16, 1 },
  { ISD::CTPOP, MVT::i8,  1 },
};
// Baseline scalar i64 costs, used on 64-bit targets when no feature-specific
// table above matched (e.g. CTLZ/CTTZ without LZCNT/BMI fall back to
// BSR/BSF sequences here).
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
  { ISD::ABS,        MVT::i64,  2 }, // SUB+CMOV
  { ISD::BITREVERSE, MVT::i64, 14 },
  { ISD::BSWAP,      MVT::i64,  1 },
  { ISD::CTLZ,       MVT::i64,  4 }, // BSR+XOR or BSR+XOR+CMOV
  { ISD::CTTZ,       MVT::i64,  3 }, // TEST+BSF+CMOV/BRANCH
  { ISD::CTPOP,      MVT::i64, 10 },
  { ISD::SADDO,      MVT::i64,  1 },
  { ISD::UADDO,      MVT::i64,  1 },
  { ISD::UMULO,      MVT::i64,  2 }, // mulq + seto
};
// Baseline scalar i32/i16/i8 costs for any x86 target — the last table
// consulted in the lookup cascade below.
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
  { ISD::ABS,        MVT::i32,  2 }, // SUB+CMOV
  { ISD::ABS,        MVT::i16,  2 }, // SUB+CMOV
  { ISD::BITREVERSE, MVT::i32, 14 },
  { ISD::BITREVERSE, MVT::i16, 14 },
  { ISD::BITREVERSE, MVT::i8,  11 },
  { ISD::BSWAP,      MVT::i32,  1 },
  { ISD::BSWAP,      MVT::i16,  1 }, // ROL
  { ISD::CTLZ,       MVT::i32,  4 }, // BSR+XOR or BSR+XOR+CMOV
  { ISD::CTLZ,       MVT::i16,  4 }, // BSR+XOR or BSR+XOR+CMOV
  { ISD::CTLZ,       MVT::i8,   4 }, // BSR+XOR or BSR+XOR+CMOV
  { ISD::CTTZ,       MVT::i32,  3 }, // TEST+BSF+CMOV/BRANCH
  { ISD::CTTZ,       MVT::i16,  3 }, // TEST+BSF+CMOV/BRANCH
  { ISD::CTTZ,       MVT::i8,   3 }, // TEST+BSF+CMOV/BRANCH
  { ISD::CTPOP,      MVT::i32,  8 },
  { ISD::CTPOP,      MVT::i16,  9 },
  { ISD::CTPOP,      MVT::i8,   7 },
  { ISD::SADDO,      MVT::i32,  1 },
  { ISD::SADDO,      MVT::i16,  1 },
  { ISD::SADDO,      MVT::i8,   1 },
  { ISD::UADDO,      MVT::i32,  1 },
  { ISD::UADDO,      MVT::i16,  1 },
  { ISD::UADDO,      MVT::i8,   1 },
  { ISD::UMULO,      MVT::i32,  2 }, // mul + seto
  { ISD::UMULO,      MVT::i16,  2 },
  { ISD::UMULO,      MVT::i8,   2 },
};
3049 
3050  Type *RetTy = ICA.getReturnType();
3051  Type *OpTy = RetTy;
3052  Intrinsic::ID IID = ICA.getID();
3053  unsigned ISD = ISD::DELETED_NODE;
3054  switch (IID) {
3055  default:
3056  break;
3057  case Intrinsic::abs:
3058  ISD = ISD::ABS;
3059  break;
3060  case Intrinsic::bitreverse:
3061  ISD = ISD::BITREVERSE;
3062  break;
3063  case Intrinsic::bswap:
3064  ISD = ISD::BSWAP;
3065  break;
3066  case Intrinsic::ctlz:
3067  ISD = ISD::CTLZ;
3068  break;
3069  case Intrinsic::ctpop:
3070  ISD = ISD::CTPOP;
3071  break;
3072  case Intrinsic::cttz:
3073  ISD = ISD::CTTZ;
3074  break;
3075  case Intrinsic::maxnum:
3076  case Intrinsic::minnum:
3077  // FMINNUM has same costs so don't duplicate.
3078  ISD = ISD::FMAXNUM;
3079  break;
3080  case Intrinsic::sadd_sat:
3081  ISD = ISD::SADDSAT;
3082  break;
3083  case Intrinsic::smax:
3084  ISD = ISD::SMAX;
3085  break;
3086  case Intrinsic::smin:
3087  ISD = ISD::SMIN;
3088  break;
3089  case Intrinsic::ssub_sat:
3090  ISD = ISD::SSUBSAT;
3091  break;
3092  case Intrinsic::uadd_sat:
3093  ISD = ISD::UADDSAT;
3094  break;
3095  case Intrinsic::umax:
3096  ISD = ISD::UMAX;
3097  break;
3098  case Intrinsic::umin:
3099  ISD = ISD::UMIN;
3100  break;
3101  case Intrinsic::usub_sat:
3102  ISD = ISD::USUBSAT;
3103  break;
3104  case Intrinsic::sqrt:
3105  ISD = ISD::FSQRT;
3106  break;
3107  case Intrinsic::sadd_with_overflow:
3108  case Intrinsic::ssub_with_overflow:
3109  // SSUBO has same costs so don't duplicate.
3110  ISD = ISD::SADDO;
3111  OpTy = RetTy->getContainedType(0);
3112  break;
3113  case Intrinsic::uadd_with_overflow:
3114  case Intrinsic::usub_with_overflow:
3115  // USUBO has same costs so don't duplicate.
3116  ISD = ISD::UADDO;
3117  OpTy = RetTy->getContainedType(0);
3118  break;
3119  case Intrinsic::umul_with_overflow:
3120  case Intrinsic::smul_with_overflow:
3121  // SMULO has same costs so don't duplicate.
3122  ISD = ISD::UMULO;
3123  OpTy = RetTy->getContainedType(0);
3124  break;
3125  }
3126 
3127  if (ISD != ISD::DELETED_NODE) {
3128  // Legalize the type.
3129  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
3130  MVT MTy = LT.second;
3131 
3132  // Attempt to lookup cost.
3133  if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
3134  MTy.isVector()) {
3135  // With PSHUFB the code is very similar for all types. If we have integer
3136  // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
3137  // we also need a PSHUFB.
3138  unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
3139 
3140  // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
3141  // instructions. We also need an extract and an insert.
3142  if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
3143  (ST->hasBWI() && MTy.is512BitVector())))
3144  Cost = Cost * 2 + 2;
3145 
3146  return LT.first * Cost;
3147  }
3148 
3149  auto adjustTableCost = [](const CostTblEntry &Entry,
3150  InstructionCost LegalizationCost,
3151  FastMathFlags FMF) {
3152  // If there are no NANs to deal with, then these are reduced to a
3153  // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
3154  // assume is used in the non-fast case.
3155  if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
3156  if (FMF.noNaNs())
3157  return LegalizationCost * 1;
3158  }
3159  return LegalizationCost * (int)Entry.Cost;
3160  };
3161 
3162  if (ST->useGLMDivSqrtCosts())
3163  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
3164  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3165 
3166  if (ST->isSLM())
3167  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3168  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3169 
3170  if (ST->hasBITALG())
3171  if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
3172  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3173 
3174  if (ST->hasVPOPCNTDQ())
3175  if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
3176  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3177 
3178  if (ST->hasCDI())
3179  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
3180  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3181 
3182  if (ST->hasBWI())
3183  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3184  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3185 
3186  if (ST->hasAVX512())
3187  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3188  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3189 
3190  if (ST->hasXOP())
3191  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3192  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3193 
3194  if (ST->hasAVX2())
3195  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3196  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3197 
3198  if (ST->hasAVX())
3199  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3200  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3201 
3202  if (ST->hasSSE42())
3203  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3204  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3205 
3206  if (ST->hasSSE41())
3207  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3208  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3209 
3210  if (ST->hasSSSE3())
3211  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
3212  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3213 
3214  if (ST->hasSSE2())
3215  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3216  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3217 
3218  if (ST->hasSSE1())
3219  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3220  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3221 
3222  if (ST->hasBMI()) {
3223  if (ST->is64Bit())
3224  if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
3225  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3226 
3227  if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
3228  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3229  }
3230 
3231  if (ST->hasLZCNT()) {
3232  if (ST->is64Bit())
3233  if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
3234  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3235 
3236  if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
3237  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3238  }
3239 
3240  if (ST->hasPOPCNT()) {
3241  if (ST->is64Bit())
3242  if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
3243  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3244 
3245  if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
3246  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3247  }
3248 
3249  if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
3250  if (const Instruction *II = ICA.getInst()) {
3251  if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
3252  return TTI::TCC_Free;
3253  if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
3254  if (LI->hasOneUse())
3255  return TTI::TCC_Free;
3256  }
3257  }
3258  }
3259 
3260  // TODO - add BMI (TZCNT) scalar handling
3261 
3262  if (ST->is64Bit())
3263  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3264  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3265 
3266  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3267  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3268  }
3269 
3271 }
3272 
3276  if (ICA.isTypeBasedOnly())
3278 
// AVX-512 rotate costs: i32/i64 element rotates are single instructions
// (presumably the VPROLV/VPRORV family — TODO confirm), including the
// 128/256-bit forms available with AVX512VL.
static const CostTblEntry AVX512CostTbl[] = {
  { ISD::ROTL, MVT::v8i64,  1 },
  { ISD::ROTL, MVT::v4i64,  1 },
  { ISD::ROTL, MVT::v2i64,  1 },
  { ISD::ROTL, MVT::v16i32, 1 },
  { ISD::ROTL, MVT::v8i32,  1 },
  { ISD::ROTL, MVT::v4i32,  1 },
  { ISD::ROTR, MVT::v8i64,  1 },
  { ISD::ROTR, MVT::v4i64,  1 },
  { ISD::ROTR, MVT::v2i64,  1 },
  { ISD::ROTR, MVT::v16i32, 1 },
  { ISD::ROTR, MVT::v8i32,  1 },
  { ISD::ROTR, MVT::v4i32,  1 }
};
// XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
// 128-bit ROTL is a single VPROT; ROTR needs the extra negation of the
// rotate amount. 256-bit types are split into two 128-bit halves.
static const CostTblEntry XOPCostTbl[] = {
  { ISD::ROTL, MVT::v4i64,  4 },
  { ISD::ROTL, MVT::v8i32,  4 },
  { ISD::ROTL, MVT::v16i16, 4 },
  { ISD::ROTL, MVT::v32i8,  4 },
  { ISD::ROTL, MVT::v2i64,  1 },
  { ISD::ROTL, MVT::v4i32,  1 },
  { ISD::ROTL, MVT::v8i16,  1 },
  { ISD::ROTL, MVT::v16i8,  1 },
  { ISD::ROTR, MVT::v4i64,  6 },
  { ISD::ROTR, MVT::v8i32,  6 },
  { ISD::ROTR, MVT::v16i16, 6 },
  { ISD::ROTR, MVT::v32i8,  6 },
  { ISD::ROTR, MVT::v2i64,  2 },
  { ISD::ROTR, MVT::v4i32,  2 },
  { ISD::ROTR, MVT::v8i16,  2 },
  { ISD::ROTR, MVT::v16i8,  2 }
};
3312  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3313  { ISD::ROTL, MVT::i64, 1 },
3314  { ISD::ROTR, MVT::i64, 1 },
3315  { ISD::FSHL, MVT::i64, 4 }