//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to some
/// "generic" X86 CPU rather than a concrete CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// check Subtarget.hasSSE42() in the lookups below, the cost is based on
/// Nehalem, as that was the first CPU to support that feature level and thus
/// most likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency):
///                    divss    sqrtss    rsqrtss
///   AMD K7           11-16    19        3
///   Piledriver       9-24     13-15     5
///   Jaguar           14       16        2
///   Pentium II,III   18       30        2
///   Nehalem          7-14     7-18      3
///   Haswell          10-13    11        5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    // - Penryn
    // - Nehalem
    // - Westmere
    // - Sandy Bridge
    // - Ivy Bridge
    // - Haswell
    // - Broadwell
    // - Skylake
    // - Kabylake
    return 32 * 1024;  //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    // - Penryn
    // - Nehalem
    // - Westmere
    // - Sandy Bridge
    // - Ivy Bridge
    // - Haswell
    // - Broadwell
    // - Skylake
    // - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  // - Penryn
  // - Nehalem
  // - Westmere
  // - Sandy Bridge
  // - Ivy Bridge
  // - Haswell
  // - Broadwell
  // - Skylake
  // - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle it instead, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // vXi8 multiplications are always promoted to vXi16.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TTI::CastContextHint::None, CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TTI::CastContextHint::None, CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
                                  Opd1PropInfo, Opd2PropInfo);
  }
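  // For example (illustrative): a v16i8 multiply is costed by the formula
  // above as zext(v16i8 -> v16i16) + v16i16 multiply + trunc(v16i16 -> v16i8),
  // with the individual numbers coming from the subtarget tables below.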

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

    // If both are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
  }
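  // For example (illustrative): a v4i32 multiply whose operands are both
  // sign-extended from 16 bits is re-costed above as a v8i16 multiply, since
  // the PMADDWD lowering costs the same as a vXi16 PMULLW.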

  // Vector multiply by pow2 will be simplified to shifts.
  if (ISD == ISD::MUL &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2)
    return getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);

  // On X86, vector signed division by a power-of-two constant is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                     Op2Info);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                     Op2Info);
    }

    return Cost;
  }
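  // Illustrative expansion of (X sdiv 4) on vXi32: SRA by 31 extracts the
  // sign mask, SRL by 30 turns it into the rounding bias (3), an ADD applies
  // the bias, and a final SRA by 2 divides - exactly the 2*SRA + SRL + ADD
  // sequence costed above.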

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                    Op2Info, TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);
  }
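  // E.g. (X udiv 8) is costed as a single 'lshr X, 3' and (X urem 8) as a
  // single 'and X, 7' (illustrative).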

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   18 }, // divss
    { ISD::FDIV, MVT::v4f32, 35 }, // divps
    { ISD::FDIV, MVT::f64,   33 }, // divsd
    { ISD::FDIV, MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
    { ISD::FMUL, MVT::f64,   2  }, // mulsd
    { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL, MVT::v4f32, 2  }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64, 2  }, // addpd
    { ISD::FSUB, MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64, 4  },
    { ISD::SUB,  MVT::v2i64, 4  },
  };

  if (ST->useSLMArithCosts()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      // TODO: Merge this into generic vXi32 MUL patterns above.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }
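  // Note that each table lookup returns LT.first * Entry->Cost: Entry->Cost
  // is the per-register cost, and LT.first is the number of legal registers
  // the original type splits into. E.g. (illustrative) a v8i64 MUL costed
  // against the SLM table legalizes to 4 x v2i64, giving 4 * 17.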

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,  1 },
    { ISD::SRA,  MVT::v4i64,  1 },
    { ISD::SRA,  MVT::v8i64,  1 },

    { ISD::SHL,  MVT::v64i8,  4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8, 2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8, 2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64, 4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,    2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,    2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,    4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,  4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL, MVT::v16i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8,   4 }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8,   6 }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8,   6 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8,   7 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8,  15 }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,   1 }, // vpsllvw
    { ISD::SRL, MVT::v8i16,   1 }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,   1 }, // vpsravw
    { ISD::SHL, MVT::v16i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v16i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v16i16,  1 }, // vpsravw
    { ISD::SHL, MVT::v32i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v32i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v32i16,  1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
    { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.

    { ISD::SHL, MVT::v8i32,  1 }, // pslld
    { ISD::SRL, MVT::v8i32,  1 }, // psrld
    { ISD::SRA, MVT::v8i32,  1 }, // psrad
    { ISD::SHL, MVT::v4i64,  1 }, // psllq
    { ISD::SRL, MVT::v4i64,  1 }, // psrlq
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, 2 }, // pmullq
    { ISD::MUL, MVT::v4i64, 2 }, // pmullq
    { ISD::MUL, MVT::v8i64, 2 }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v4i32,   1 },
    { ISD::SRL,  MVT::v4i32,   1 },
    { ISD::SRA,  MVT::v4i32,   1 },
    { ISD::SHL,  MVT::v8i32,   1 },
    { ISD::SRL,  MVT::v8i32,   1 },
    { ISD::SRA,  MVT::v8i32,   1 },
    { ISD::SHL,  MVT::v16i32,  1 },
    { ISD::SRL,  MVT::v16i32,  1 },
    { ISD::SRA,  MVT::v16i32,  1 },

    { ISD::SHL,  MVT::v2i64,   1 },
    { ISD::SRL,  MVT::v2i64,   1 },
    { ISD::SHL,  MVT::v4i64,   1 },
    { ISD::SRL,  MVT::v4i64,   1 },
    { ISD::SHL,  MVT::v8i64,   1 },
    { ISD::SRL,  MVT::v8i64,   1 },

    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,   6 }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::i64,     1 }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f64,     4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,   4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,   8 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  16 }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f32,     3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,   5 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2, even though we mark them as
    // custom so we can detect the cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,    1 },
    { ISD::SRL, MVT::v16i8,    2 },
    { ISD::SRA, MVT::v16i8,    2 },
    { ISD::SHL, MVT::v8i16,    1 },
    { ISD::SRL, MVT::v8i16,    2 },
    { ISD::SRA, MVT::v8i16,    2 },
    { ISD::SHL, MVT::v4i32,    1 },
    { ISD::SRL, MVT::v4i32,    2 },
    { ISD::SRA, MVT::v4i32,    2 },
    { ISD::SHL, MVT::v2i64,    1 },
    { ISD::SRL, MVT::v2i64,    2 },
    { ISD::SRA, MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  2+2 },
    { ISD::SRL, MVT::v32i8,  4+2 },
    { ISD::SRA, MVT::v32i8,  4+2 },
    { ISD::SHL, MVT::v16i16, 2+2 },
    { ISD::SRL, MVT::v16i16, 4+2 },
    { ISD::SRA, MVT::v16i16, 4+2 },
    { ISD::SHL, MVT::v8i32,  2+2 },
    { ISD::SRL, MVT::v8i32,  4+2 },
    { ISD::SRA, MVT::v8i32,  4+2 },
    { ISD::SHL, MVT::v4i64,  2+2 },
    { ISD::SRL, MVT::v4i64,  4+2 },
    { ISD::SRA, MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL, MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL, MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL, MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL, MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA, MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA, MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA, MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
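  // E.g. (illustrative) 'shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>' can
  // be lowered as a multiply by <i32 2, i32 4, i32 8, i32 16>, so it is
  // costed as a MUL from this point on.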

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  34 }, // 2*vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,   5 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16,  7 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   2 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   2 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,   6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::f64,     1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,  4 },
    { ISD::MUL,  MVT::v8i32,   5 }, // BTVER2 from http://www.agner.org/
    { ISD::MUL,  MVT::v4i64,  12 },

    { ISD::SUB,  MVT::v32i8,   4 },
    { ISD::ADD,  MVT::v32i8,   4 },
    { ISD::SUB,  MVT::v16i16,  4 },
    { ISD::ADD,  MVT::v16i16,  4 },
    { ISD::SUB,  MVT::v8i32,   4 },
    { ISD::ADD,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v4i64,   4 },
    { ISD::ADD,  MVT::v4i64,   4 },

    { ISD::SHL,  MVT::v32i8,  22 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,   6 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 13 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,   3 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   9 }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SHL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRL,  MVT::v32i8,  23 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v32i8,  44 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRA,  MVT::v2i64,   5 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v4i64,  12 }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   2 }, // BTVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64,     2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   4 }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/

    { ISD::MUL,  MVT::v2i64,  6 }  // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
    { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
    { ISD::SHL, MVT::v4i32,  4 }, // pslld/paddd/cvttps2dq/pmulld

    { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
    { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
    { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.

    { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
    { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.

    { ISD::MUL, MVT::v4i32,  2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8, 13 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16, 25 }, // cmpgtw sequence.
    { ISD::SHL,  MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
    { ISD::SHL,  MVT::v2i64,  4 }, // splat+shuffle sequence.

    { ISD::SRL,  MVT::v16i8, 14 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRL,  MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,  4 }, // splat+shuffle sequence.

    { ISD::SRA,  MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRA,  MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,  8 }, // srl/xor/sub splat+shuffle sequence.

    { ISD::MUL,  MVT::v8i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,  6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,  8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,   23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::f64,    1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v2f64,  1 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,    2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,    2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,    2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    2 }, // Pentium III from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
    { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
    { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/
  };

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ADD, MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB, MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
  };

  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular registers.
  // The overhead of division is going to dominate most kernels anyways, so
  // try hard to prevent vectorization of division - it is generally a bad
  // idea. Assume somewhat arbitrarily that we have to be able to hide "20
  // cycles" for each lane.
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    InstructionCost ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }
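  // E.g. (illustrative) a v4i32 sdiv by a non-constant divisor with
  // LT.first == 1 returns 20 * 1 * 4 * ScalarCost - deliberately punitive to
  // steer the vectorizers away from division.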

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *BaseTp,
                                           ArrayRef<int> Mask, int Index,
                                           VectorType *SubTp,
                                           ArrayRef<const Value *> Args) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  Kind = improveShuffleKindFromMask(Kind, Mask);
  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        InstructionCost ExtractCost = getShuffleCost(
            TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);

        // If the original size is 32-bits or more, we can use pshufd. Otherwise
        // if we have SSSE3 we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }
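  // E.g. (illustrative) extracting the upper v4i32 half of a v8i32
  // (Index == 4) is subvector-aligned and costs SubLT.first, while any
  // extract at Index == 0 is free.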

  // Subvector insertions are cheap if the subvectors are aligned.
  // Note that in general, the insertion starting at the beginning of a vector
  // isn't free, because we need to preserve the rest of the wide vector.
  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }

    // If the insertion isn't aligned, treat it like a 2-op shuffle.
    Kind = TTI::SK_PermuteTwoSrc;
  }
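  // E.g. (illustrative) inserting a v4f32 into a v8f32 at Index 0 or 4 costs
  // SubLT.first, while an insertion at Index 2 falls through and is treated
  // as a two-source permute.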

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      !ST->hasSSSE3()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
      {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck

      {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
      {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck

      {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck

      {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
    };

    if (ST->hasSSE2())
      if (const auto *Entry =
              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
        return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. We can provide an accurate cost only for splits
  // where the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      InstructionCost NumOfDests = LT.first;

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      if (!Mask.empty() && NumOfDests.isValid()) {
        // Try to perform better estimation of the permutation.
        // 1. Split the source/destination vectors into real registers.
        // 2. Do the mask analysis to identify which real registers are
        //    permuted. If more than one source register is used to build a
        //    destination register, the cost for this destination register is
        //    (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only
        //    one source register is used, build the mask and calculate the
        //    cost as a cost of PermuteSingleSrc.
        // Also, for the single register permute we try to identify if the
        // destination register is just a copy of the source register or the
        // copy of the previous destination register (the cost is
        // TTI::TCC_Basic). If the source register is just reused, the cost
        // for this operation is 0.
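        // For example (illustrative): when a v16i32 single-source permute is
        // split into four v4i32 registers, a destination register built from
        // two different source registers costs one PermuteTwoSrc, one built
        // by reshuffling a single source register costs one PermuteSingleSrc,
        // and one that simply reuses a source register unchanged costs 0.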
        unsigned E = *NumOfDests.getValue();
        unsigned NormalizedVF =
            LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
        unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
        unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
        SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
        copy(Mask, NormalizedMask.begin());
        unsigned PrevSrcReg = 0;
        ArrayRef<int> PrevRegMask;
        InstructionCost Cost = 0;
        processShuffleMasks(
            NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
            [this, SingleOpTy, &PrevSrcReg, &PrevRegMask,
             &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
              if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
                // Check if the previous register can be just copied to the
                // next one.
                if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
                    PrevRegMask != RegMask)
                  Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
                                         RegMask, 0, nullptr);
                else
                  // Just a copy of previous destination register.
                  Cost += TTI::TCC_Basic;
                return;
              }
              if (SrcReg != DestReg &&
                  any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
                // Just a copy of the source register.
                Cost += TTI::TCC_Basic;
              }
              PrevSrcReg = SrcReg;
              PrevRegMask = RegMask;
            },
            [this, SingleOpTy, &Cost](ArrayRef<int> RegMask,
                                      unsigned /*Unused*/,
                                      unsigned /*Unused*/) {
              Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
                                     0, nullptr);
            });
        return Cost;
      }

      InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                            None, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    InstructionCost NumOfDests = LT.first;
    InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }
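  // E.g. (illustrative) a v8f64 two-input shuffle on AVX2 legalizes to
  // LT.first == 2, so the estimate becomes 2 destinations * (2 * 2 - 1) = 6
  // partial shuffles.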

  static const CostTblEntry AVX512FP16ShuffleTbl[] = {
    {TTI::SK_Broadcast,        MVT::v32f16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast,        MVT::v16f16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast,        MVT::v8f16,  1}, // vpbroadcastw

    {TTI::SK_Reverse,          MVT::v32f16, 2}, // vpermw
    {TTI::SK_Reverse,          MVT::v16f16, 2}, // vpermw
    {TTI::SK_Reverse,          MVT::v8f16,  1}, // vpshufb

    {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v8f16,  1}, // vpshufb

    {TTI::SK_PermuteTwoSrc,    MVT::v32f16, 2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v16f16, 2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v8f16,  2}  // vpermt2w
  };

  if (!ST->useSoftFloat() && ST->hasFP16())
    if (const auto *Entry =
            CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
    {TTI::SK_Reverse,          MVT::v64i8, 1}, // vpermb
    {TTI::SK_Reverse,          MVT::v32i8, 1}, // vpermb

    {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
    {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

    {TTI::SK_PermuteTwoSrc,    MVT::v64i8, 2}, // vpermt2b
    {TTI::SK_PermuteTwoSrc,    MVT::v32i8, 2}, // vpermt2b
    {TTI::SK_PermuteTwoSrc,    MVT::v16i8, 2}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
    {TTI::SK_Broadcast,        MVT::v32i16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast,        MVT::v64i8,  1}, // vpbroadcastb

    {TTI::SK_Reverse,          MVT::v32i16, 2}, // vpermw
    {TTI::SK_Reverse,          MVT::v16i16, 2}, // vpermw
    {TTI::SK_Reverse,          MVT::v64i8,  2}, // pshufb + vshufi64x2

    {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v64i8,  8}, // extend to v32i16

    {TTI::SK_PermuteTwoSrc,    MVT::v32i16, 2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v16i16, 2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v8i16,  2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19}, // 6 * v32i8 + 1

    {TTI::SK_Select,           MVT::v32i16, 1}, // vblendmw
    {TTI::SK_Select,           MVT::v64i8,  1}, // vblendmb
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

1366  static const CostTblEntry AVX512ShuffleTbl[] = {
1367  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1368  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1369  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1370  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1371  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1372  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1373 
1374  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1375  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1376  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1377  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1378  {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
1379  {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
1380 
1381  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1382  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1383  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1384  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1385  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1386  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1387  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1388  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1389  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1390  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1391  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1392  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1393  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1394 
1395  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1396  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1397  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1398  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1399  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1400  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1401  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1402  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1403  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1404  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1405  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1406  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
1407 
1408  // FIXME: This just applies the type legalization cost rules above
1409  // assuming these completely split.
1414 
1415  {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
1416  {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
1417  {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
1418  {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
1419  {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
1420  {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
1421  };
1422 
1423  if (ST->hasAVX512())
1424  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1425  return LT.first * Entry->Cost;
1426 
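// Types with no AVX512ShuffleTbl entry fall through to the weaker feature
// tables below; e.g. on an AVX512F-only target (no BWI) a v16i16 reverse is
// priced by the AVX2 entry (vperm2i128 + pshufb, cost 2) rather than here.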
1427  static const CostTblEntry AVX2ShuffleTbl[] = {
1428  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1429  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1430  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1431  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1432  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1433  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1434 
1435  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1436  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1437  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1438  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1439  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1440  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1441 
1442  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1443  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1444 
1445  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1446  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1447  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1448  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1449  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1450  // + vpblendvb
1451  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1452  // + vpblendvb
1453 
1454  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1455  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1456  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1457  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1458  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1459  // + vpblendvb
1460  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1461  // + vpblendvb
1462  };
1463 
1464  if (ST->hasAVX2())
1465  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1466  return LT.first * Entry->Cost;
1467 
1468  static const CostTblEntry XOPShuffleTbl[] = {
1469  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1470  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1471  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1472  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1473  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1474  // + vinsertf128
1475  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1476  // + vinsertf128
1477 
1478  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1479  // + vinsertf128
1480  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1481  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1482  // + vinsertf128
1483  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1484  };
1485 
1486  if (ST->hasXOP())
1487  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1488  return LT.first * Entry->Cost;
1489 
1490  static const CostTblEntry AVX1ShuffleTbl[] = {
1491  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1492  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1493  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1494  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1495  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1496  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1497 
1498  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1499  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1500  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1501  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1502  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1503  // + vinsertf128
1504  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1505  // + vinsertf128
1506 
1507  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1508  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1509  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1510  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1511  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1512  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1513 
1514  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1515  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1516  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1517  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1518  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1519  // + 2*por + vinsertf128
1520  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1521  // + 2*por + vinsertf128
1522 
1523  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1524  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1525  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1526  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1527  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1528  // + 4*por + vinsertf128
1529  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1530  // + 4*por + vinsertf128
1531  };
1532 
1533  if (ST->hasAVX())
1534  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1535  return LT.first * Entry->Cost;
1536 
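// The select rows above show the same progression: without AVX2's vpblendvb,
// 256-bit i8/i16 selects expand to the 3-op and/andn/or idiom, so AVX1
// charges 3 where the AVX2 table charges 1.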
1537  static const CostTblEntry SSE41ShuffleTbl[] = {
1538  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1539  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1540  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1541  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1542  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1543  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1544  };
1545 
1546  if (ST->hasSSE41())
1547  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1548  return LT.first * Entry->Cost;
1549 
1550  static const CostTblEntry SSSE3ShuffleTbl[] = {
1551  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1552  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1553 
1554  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1555  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1556 
1557  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1558  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1559 
1560  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1561  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1562 
1563  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1564  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1565  };
1566 
1567  if (ST->hasSSSE3())
1568  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1569  return LT.first * Entry->Cost;
1570 
1571  static const CostTblEntry SSE2ShuffleTbl[] = {
1572  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1573  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1574  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1575  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1576  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1577 
1578  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1579  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1580  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1581  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1582  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1583  // + 2*pshufd + 2*unpck + packus
1584 
1585  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1586  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1587  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1588  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1589  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1590 
1591  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1592  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1593  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1594  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1595  // + pshufd/unpck
1596  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1597  // + 2*pshufd + 2*unpck + 2*packus
1598 
1599  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1600  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1601  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1602  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1603  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1604  };
1605 
1606  static const CostTblEntry SSE3BroadcastLoadTbl[] = {
1607  {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
1608  };
1609 
1610  if (ST->hasSSE2()) {
1611  bool IsLoad =
1612  llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
1613  if (ST->hasSSE3() && IsLoad)
1614  if (const auto *Entry =
1615  CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
1616  assert(isLegalBroadcastLoad(BaseTp->getElementType(),
1617  LT.second.getVectorElementCount()) &&
1618  "Table entry missing from isLegalBroadcastLoad()");
1619  return LT.first * Entry->Cost;
1620  }
1621 
1622  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1623  return LT.first * Entry->Cost;
1624  }
1625 
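// Example of the broadcast-load special case above (assuming SSE3): splatting
// a <2 x double> whose operand is a load folds into a single movddup, so the
// shuffle itself is reported as free (cost 0); without a load operand the
// plain SSE2 shufpd entry (cost 1) applies instead.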
1626  static const CostTblEntry SSE1ShuffleTbl[] = {
1627  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1628  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1629  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1630  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1631  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1632  };
1633 
1634  if (ST->hasSSE1())
1635  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1636  return LT.first * Entry->Cost;
1637 
1638  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1639 }
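// A minimal caller-side sketch (hypothetical; a populated TargetTransformInfo
// TTI and an LLVMContext Ctx are assumed, and the defaulted Mask/Index/SubTp
// arguments are omitted):
//   auto *Ty = FixedVectorType::get(Type::getInt8Ty(Ctx), 16);
//   InstructionCost C =
//       TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, Ty);
//   // With SSSE3 this reports 1 (one pshufb); with bare SSE2 it reports 9
//   // per the SSE2ShuffleTbl entry above.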
1640 
1641 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1642  Type *Src,
1643  TTI::CastContextHint CCH,
1644  TTI::TargetCostKind CostKind,
1645  const Instruction *I) {
1646  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1647  assert(ISD && "Invalid opcode");
1648 
1649  // TODO: Allow non-throughput costs that aren't binary.
1650  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1651  if (CostKind != TTI::TCK_RecipThroughput)
1652  return Cost == 0 ? 0 : 1;
1653  return Cost;
1654  };
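// In other words, for non-throughput cost kinds (code size, latency, uop
// count) every conversion is currently flattened to a binary free/not-free
// answer; only TCK_RecipThroughput queries see the detailed table values.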
1655 
1656  // The cost tables include both specific, custom (non-legal) src/dst type
1657  // conversions and generic, legalized types. We test for customs first, before
1658  // falling back to legalization.
1659  // FIXME: Need a better design of the cost table to handle non-simple types of
1660  // potential massive combinations (elem_num x src_type x dst_type).
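// Concretely, the lookups below first try the exact (Dst, Src) simple-type
// pair against these tables (catching custom entries for non-legal types,
// e.g. the v2i8 <- v2i16 truncate), and only on a miss re-query with the
// legalized types, scaling the entry by the larger of the two split factors.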
1661  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1664 
1665  // Mask sign extend has an instruction.
1683 
1684  // Mask zero extend is a sext + shift.
1702 
1720 
1722  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1723  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
1724  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
1725  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
1726  };
1727 
1728  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1729  // Mask sign extend has an instruction.
1738 
1739  // Mask zero extend is a sext + shift.
1748 
1757 
1760 
1763 
1766 
1769  };
1770 
1771  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1772  // 256-bit wide vectors.
1773 
1774  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1778 
1779  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1780  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1781  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1782  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1783  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1784  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1785  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1786  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1787  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1788  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1789  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1790  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1791  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1792  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1793  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1794  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
1795  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
1796  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
1797  { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
1798  { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
1799  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
1800  { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
1801  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
1802  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
1803  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
1804  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
1805  { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
1806  { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
1807  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
1808  { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
1809  { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
1810  { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
1811  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1812  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1813 
1814  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1817 
1818  // Sign extend is zmm vpternlogd+vptruncdb.
1819  // Zero extend is zmm broadcast load+vptruncdw.
1828 
1829  // Sign extend is zmm vpternlogd+vptruncdw.
1830  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1839 
1840  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1841  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1842  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1843  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1844  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1845  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1846  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1847  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1848  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1849  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1850 
1851  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1852  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1853  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1854  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1855 
1866 
1867  { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1868  { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1869 
1878 
1889 
1901 
1908  };
1909 
1910  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1911  // Mask sign extend has an instruction.
1929 
1930  // Mask zero extend is a sext + shift.
1948 
1966 
1968  };
1969 
1970  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1971  // Mask sign extend has an instruction.
1980 
1981  // Mask zero extend is a sext + shift.
1990 
1999 
2004 
2009 
2014 
2019  };
2020 
2021  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2022  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2023  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2024  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2025  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2026  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2027  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2028  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2029  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2030  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2031  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2032  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2033  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2034  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2035  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2036  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2037  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2038  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2039 
2040  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2041  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2050 
2051  // sign extend is vpcmpeq+maskedmove+vpmovdw
2052  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2061 
2062  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2063  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2064  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2065  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2066  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2067  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2068  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2069  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2070  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2071  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2072 
2085 
2090 
2104 
2108 
2116  };
2117 
2118  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2125 
2140 
2142 
2155 
2158 
2163 
2172 
2180 
2191  };
2192 
2193  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2200 
2213 
2219 
2222  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2226  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2228 
2241 
2259 
2271 
2285 
2288  };
2289 
2290  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2303 
2304  // These truncates end up widening elements.
2305  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2306  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2307  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2308 
2312 
2324 
2339 
2350 
2361  };
2362 
2363  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2364  // These are somewhat magic numbers justified by comparing the
2365  // output of llvm-mca for our various supported scheduler models
2366  // and basing them on the worst-case scenario.
2379 
2393 
2404 
2408  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2415 
2428 
2429  // These truncates are really widening elements.
2430  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2431  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2432  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2433  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2434  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2435  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2436 
2437  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2439  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2445  { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2446  { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2447  { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2448  };
2449 
2450  // Attempt to map directly to (simple) MVT types to let us match custom entries.
2451  EVT SrcTy = TLI->getValueType(DL, Src);
2452  EVT DstTy = TLI->getValueType(DL, Dst);
2453 
2454  // The function getSimpleVT only handles simple value types.
2455  if (SrcTy.isSimple() && DstTy.isSimple()) {
2456  MVT SimpleSrcTy = SrcTy.getSimpleVT();
2457  MVT SimpleDstTy = DstTy.getSimpleVT();
2458 
2459  if (ST->useAVX512Regs()) {
2460  if (ST->hasBWI())
2461  if (const auto *Entry = ConvertCostTableLookup(
2462  AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2463  return AdjustCost(Entry->Cost);
2464 
2465  if (ST->hasDQI())
2466  if (const auto *Entry = ConvertCostTableLookup(
2467  AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2468  return AdjustCost(Entry->Cost);
2469 
2470  if (ST->hasAVX512())
2471  if (const auto *Entry = ConvertCostTableLookup(
2472  AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2473  return AdjustCost(Entry->Cost);
2474  }
2475 
2476  if (ST->hasBWI())
2477  if (const auto *Entry = ConvertCostTableLookup(
2478  AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2479  return AdjustCost(Entry->Cost);
2480 
2481  if (ST->hasDQI())
2482  if (const auto *Entry = ConvertCostTableLookup(
2483  AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2484  return AdjustCost(Entry->Cost);
2485 
2486  if (ST->hasAVX512())
2487  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2488  SimpleDstTy, SimpleSrcTy))
2489  return AdjustCost(Entry->Cost);
2490 
2491  if (ST->hasAVX2()) {
2492  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2493  SimpleDstTy, SimpleSrcTy))
2494  return AdjustCost(Entry->Cost);
2495  }
2496 
2497  if (ST->hasAVX()) {
2498  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2499  SimpleDstTy, SimpleSrcTy))
2500  return AdjustCost(Entry->Cost);
2501  }
2502 
2503  if (ST->hasSSE41()) {
2504  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2505  SimpleDstTy, SimpleSrcTy))
2506  return AdjustCost(Entry->Cost);
2507  }
2508 
2509  if (ST->hasSSE2()) {
2510  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2511  SimpleDstTy, SimpleSrcTy))
2512  return AdjustCost(Entry->Cost);
2513  }
2514  }
2515 
2516  // Fall back to legalized types.
2517  std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2518  std::pair<InstructionCost, MVT> LTDest =
2519  TLI->getTypeLegalizationCost(DL, Dst);
2520 
2521  // If we're truncating to the same legalized type, just assume it's free.
2522  if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2523  return TTI::TCC_Free;
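// (This triggers when both sides legalize to the same MVT, e.g. when two
// narrow element types are both promoted to the same legal width, so the
// truncate disappears during legalization.)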
2524 
2525  if (ST->useAVX512Regs()) {
2526  if (ST->hasBWI())
2527  if (const auto *Entry = ConvertCostTableLookup(
2528  AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2529  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2530 
2531  if (ST->hasDQI())
2532  if (const auto *Entry = ConvertCostTableLookup(
2533  AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2534  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2535 
2536  if (ST->hasAVX512())
2537  if (const auto *Entry = ConvertCostTableLookup(
2538  AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2539  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2540  }
2541 
2542  if (ST->hasBWI())
2543  if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2544  LTDest.second, LTSrc.second))
2545  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2546 
2547  if (ST->hasDQI())
2548  if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2549  LTDest.second, LTSrc.second))
2550  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2551 
2552  if (ST->hasAVX512())
2553  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2554  LTDest.second, LTSrc.second))
2555  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2556 
2557  if (ST->hasAVX2())
2558  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2559  LTDest.second, LTSrc.second))
2560  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2561 
2562  if (ST->hasAVX())
2563  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2564  LTDest.second, LTSrc.second))
2565  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2566 
2567  if (ST->hasSSE41())
2568  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2569  LTDest.second, LTSrc.second))
2570  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2571 
2572  if (ST->hasSSE2())
2573  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2574  LTDest.second, LTSrc.second))
2575  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2576 
2577  // Fallback: for i8/i16 sitofp/uitofp cases we need to extend to i32 first
2578  // and then use sitofp.
2579  if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
2580  1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
2581  Type *ExtSrc = Src->getWithNewBitWidth(32);
2582  unsigned ExtOpc =
2583  (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
2584 
2585  // For scalar loads the extend would be free.
2586  InstructionCost ExtCost = 0;
2587  if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
2588  ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
2589 
2590  return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
2591  TTI::CastContextHint::None, CostKind);
2592  }
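// Worked example: uitofp i16 -> float is priced as zext i16 -> i32 (free when
// the integer source comes straight from a scalar load) plus sitofp
// i32 -> float, since i32 is the narrowest width with direct int-to-fp
// support.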
2593 
2594  // Fallback: for fptosi/fptoui to i8/i16 we convert via fptosi to i32 and
2595  // then truncate the result.
2596  if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
2597  1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
2598  Type *TruncDst = Dst->getWithNewBitWidth(32);
2599  return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
2600  getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
2601  TTI::CastContextHint::None, CostKind);
2602  }
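// Similarly, e.g. fptoui f32 -> i8 is priced as fptosi f32 -> i32 plus a
// trunc i32 -> i8, rather than carrying dedicated narrow-destination table
// entries.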
2603 
2604  return AdjustCost(
2605  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2606 }
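// Caller-side sketch (hypothetical; TTI and Ctx assumed), pricing a
// v16i8 <- v16i16 truncate at throughput cost kind:
//   InstructionCost C = TTI.getCastInstrCost(
//       Instruction::Trunc, FixedVectorType::get(Type::getInt8Ty(Ctx), 16),
//       FixedVectorType::get(Type::getInt16Ty(Ctx), 16),
//       TargetTransformInfo::CastContextHint::None,
//       TargetTransformInfo::TCK_RecipThroughput);
//   // On a plain AVX1 target this hits the AVXConversionTbl entry
//   // (and+extract+packuswb, cost 2).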
2607 
2608 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2609  Type *CondTy,
2610  CmpInst::Predicate VecPred,
2611  TTI::TargetCostKind CostKind,
2612  const Instruction *I) {
2613  // TODO: Handle other cost kinds.
2614  if (CostKind != TTI::TCK_RecipThroughput)
2615  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2616  I);
2617 
2618  // Legalize the type.
2619  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2620 
2621  MVT MTy = LT.second;
2622 
2623  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2624  assert(ISD && "Invalid opcode");
2625 
2626  unsigned ExtraCost = 0;
2627  if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
2628  // Some vector comparison predicates cost extra instructions.
2629  // TODO: Should we invert this and assume worst case cmp costs
2630  // and reduce for particular predicates?
2631  if (MTy.isVector() &&
2632  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2633  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2634  ST->hasBWI())) {
2635  // Fallback to I if a specific predicate wasn't specified.
2636  CmpInst::Predicate Pred = VecPred;
2637  if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
2638  Pred == CmpInst::BAD_FCMP_PREDICATE))
2639  Pred = cast<CmpInst>(I)->getPredicate();
2640 
2641  switch (Pred) {
2642  case CmpInst::Predicate::ICMP_NE:
2643  // xor(cmpeq(x,y),-1)
2644  ExtraCost = 1;
2645  break;
2646  case CmpInst::Predicate::ICMP_SGE:
2647  case CmpInst::Predicate::ICMP_SLE:
2648  // xor(cmpgt(x,y),-1)
2649  ExtraCost = 1;
2650  break;
2651  case CmpInst::Predicate::ICMP_ULT:
2652  case CmpInst::Predicate::ICMP_UGT:
2653  // cmpgt(xor(x,signbit),xor(y,signbit))
2654  // xor(cmpeq(pmaxu(x,y),x),-1)
2655  ExtraCost = 2;
2656  break;
2657  case CmpInst::Predicate::ICMP_ULE:
2658  case CmpInst::Predicate::ICMP_UGE:
2659  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2660  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2661  // cmpeq(psubus(x,y),0)
2662  // cmpeq(pminu(x,y),x)
2663  ExtraCost = 1;
2664  } else {
2665  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2666  ExtraCost = 3;
2667  }
2668  break;
2669  case CmpInst::Predicate::BAD_ICMP_PREDICATE:
2670  case CmpInst::Predicate::BAD_FCMP_PREDICATE:
2671  // Assume worst case scenario and add the maximum extra cost.
2672  ExtraCost = 3;
2673  break;
2674  default:
2675  break;
2676  }
2677  }
2678  }
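// Worked example: an ICMP_UGT compare of v4i32 on a bare SSE2 target takes
// the signbit-xor form above (ExtraCost = 2); the SSE2 table below adds
// SETCC v4i32 = 1, so the final cost is LT.first * (2 + 1) = 3.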
2679 
2680  static const CostTblEntry SLMCostTbl[] = {
2681  // slm pcmpeq/pcmpgt throughput is 2
2682  { ISD::SETCC, MVT::v2i64, 2 },
2683  };
2684 
2685  static const CostTblEntry AVX512BWCostTbl[] = {
2686  { ISD::SETCC, MVT::v32i16, 1 },
2687  { ISD::SETCC, MVT::v64i8, 1 },
2688 
2689  { ISD::SELECT, MVT::v32i16, 1 },
2690  { ISD::SELECT, MVT::v64i8, 1 },
2691  };
2692 
2693  static const CostTblEntry AVX512CostTbl[] = {
2694  { ISD::SETCC, MVT::v8i64, 1 },
2695  { ISD::SETCC, MVT::v16i32, 1 },
2696  { ISD::SETCC, MVT::v8f64, 1 },
2697  { ISD::SETCC, MVT::v16f32, 1 },
2698 
2699  { ISD::SELECT, MVT::v8i64, 1 },
2700  { ISD::SELECT, MVT::v4i64, 1 },
2701  { ISD::SELECT, MVT::v2i64, 1 },
2702  { ISD::SELECT, MVT::v16i32, 1 },
2703  { ISD::SELECT, MVT::v8i32, 1 },
2704  { ISD::SELECT, MVT::v4i32, 1 },
2705  { ISD::SELECT, MVT::v8f64, 1 },
2706  { ISD::SELECT, MVT::v4f64, 1 },
2707  { ISD::SELECT, MVT::v2f64, 1 },
2708  { ISD::SELECT, MVT::f64, 1 },
2709  { ISD::SELECT, MVT::v16f32, 1 },
2710  { ISD::SELECT, MVT::v8f32 , 1 },
2711  { ISD::SELECT, MVT::v4f32, 1 },
2712  { ISD::SELECT, MVT::f32 , 1 },
2713 
2714  { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2715  { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2716 
2717  { ISD::SELECT, MVT::v32i16, 2 },
2718  { ISD::SELECT, MVT::v16i16, 1 },
2719  { ISD::SELECT, MVT::v8i16, 1 },
2720  { ISD::SELECT, MVT::v64i8, 2 },
2721  { ISD::SELECT, MVT::v32i8, 1 },
2722  { ISD::SELECT, MVT::v16i8, 1 },
2723  };
2724 
2725  static const CostTblEntry AVX2CostTbl[] = {
2726  { ISD::SETCC, MVT::v4i64, 1 },
2727  { ISD::SETCC, MVT::v8i32, 1 },
2728  { ISD::SETCC, MVT::v16i16, 1 },
2729  { ISD::SETCC, MVT::v32i8, 1 },
2730 
2731  { ISD::SELECT, MVT::v4f64, 2 }, // vblendvpd
2732  { ISD::SELECT, MVT::v8f32, 2 }, // vblendvps
2733  { ISD::SELECT, MVT::v4i64, 2 }, // pblendvb
2734  { ISD::SELECT, MVT::v8i32, 2 }, // pblendvb
2735  { ISD::SELECT, MVT::v16i16, 2 }, // pblendvb
2736  { ISD::SELECT, MVT::v32i8, 2 }, // pblendvb
2737  };
2738 
2739  static const CostTblEntry AVX1CostTbl[] = {
2740  { ISD::SETCC, MVT::v4f64, 1 },
2741  { ISD::SETCC, MVT::v8f32, 1 },
2742  // AVX1 does not support 8-wide integer compare.
2743  { ISD::SETCC, MVT::v4i64, 4 },
2744  { ISD::SETCC, MVT::v8i32, 4 },
2745  { ISD::SETCC, MVT::v16i16, 4 },
2746  { ISD::SETCC, MVT::v32i8, 4 },
2747 
2748  { ISD::SELECT, MVT::v4f64, 3 }, // vblendvpd
2749  { ISD::SELECT, MVT::v8f32, 3 }, // vblendvps
2750  { ISD::SELECT, MVT::v4i64, 3 }, // vblendvpd
2751  { ISD::SELECT, MVT::v8i32, 3 }, // vblendvps
2752  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2753  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2754  };
2755 
2756  static const CostTblEntry SSE42CostTbl[] = {
2757  { ISD::SETCC, MVT::v2i64, 1 },
2758  };
2759 
2760  static const CostTblEntry SSE41CostTbl[] = {
2761  { ISD::SETCC, MVT::v2f64, 1 },
2762  { ISD::SETCC, MVT::v4f32, 1 },
2763 
2764  { ISD::SELECT, MVT::v2f64, 2 }, // blendvpd
2765  { ISD::SELECT, MVT::f64, 2 }, // blendvpd
2766  { ISD::SELECT, MVT::v4f32, 2 }, // blendvps
2767  { ISD::SELECT, MVT::f32 , 2 }, // blendvps
2768  { ISD::SELECT, MVT::v2i64, 2 }, // pblendvb
2769  { ISD::SELECT, MVT::v4i32, 2 }, // pblendvb
2770  { ISD::SELECT, MVT::v8i16, 2 }, // pblendvb
2771  { ISD::SELECT, MVT::v16i8, 2 }, // pblendvb
2772  };
2773 
2774  static const CostTblEntry SSE2CostTbl[] = {
2775  { ISD::SETCC, MVT::v2f64, 2 },
2776  { ISD::SETCC, MVT::f64, 1 },
2777  { ISD::SETCC, MVT::v2i64, 5 }, // pcmpeqd/pcmpgtd expansion
2778  { ISD::SETCC, MVT::v4i32, 1 },
2779  { ISD::SETCC, MVT::v8i16, 1 },
2780  { ISD::SETCC, MVT::v16i8, 1 },
2781 
2782  { ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd
2783  { ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd
2784  { ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por
2785  { ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por
2786  { ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por
2787  { ISD::SELECT, MVT::v16i8, 2 }, // pand + pandn + por
2788  };
2789 
2790  static const CostTblEntry SSE1CostTbl[] = {
2791  { ISD::SETCC, MVT::v4f32, 2 },
2792  { ISD::SETCC, MVT::f32, 1 },
2793 
2794  { ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps
2795  { ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps
2796  };
2797 
2798  if (ST->useSLMArithCosts())
2799  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2800  return LT.first * (ExtraCost + Entry->Cost);
2801 
2802  if (ST->hasBWI())
2803  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2804  return LT.first * (ExtraCost + Entry->Cost);
2805 
2806  if (ST->hasAVX512())
2807  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2808  return LT.first * (ExtraCost + Entry->Cost);
2809 
2810  if (ST->hasAVX2())
2811  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2812  return LT.first * (ExtraCost + Entry->Cost);
2813 
2814  if (ST->hasAVX())
2815  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2816  return LT.first * (ExtraCost + Entry->Cost);
2817 
2818  if (ST->hasSSE42())
2819  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2820  return LT.first * (ExtraCost + Entry->Cost);
2821 
2822  if (ST->hasSSE41())
2823  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2824  return LT.first * (ExtraCost + Entry->Cost);
2825 
2826  if (ST->hasSSE2())
2827  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2828  return LT.first * (ExtraCost + Entry->Cost);
2829 
2830  if (ST->hasSSE1())
2831  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2832  return LT.first * (ExtraCost + Entry->Cost);
2833 
2834  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2835 }
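// E.g. a vector select of v16i8 reports 2 on both SSE4.1 (pblendvb) and bare
// SSE2 (pand + pandn + por): the totals agree even though the modelled
// instruction sequences differ.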
2836 
2837 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
2838 
2839 InstructionCost
2840 X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2841  TTI::TargetCostKind CostKind) {
2842 
2843  // Costs should match the codegen from:
2844  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2845  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2846  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2847  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2848  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2849 
2850  // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2851  // specialized in these tables yet.
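// As a concrete tie between table and codegen: the SSSE3 CTPOP v16i8 entry
// of 6 below corresponds to the pshufb-based nibble-LUT popcount sequence
// exercised by the vector-popcnt-*.ll tests.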
2852  static const CostTblEntry AVX512BITALGCostTbl[] = {
2853  { ISD::CTPOP, MVT::v32i16, 1 },
2854  { ISD::CTPOP, MVT::v64i8, 1 },
2855  { ISD::CTPOP, MVT::v16i16, 1 },
2856  { ISD::CTPOP, MVT::v32i8, 1 },
2857  { ISD::CTPOP, MVT::v8i16, 1 },
2858  { ISD::CTPOP, MVT::v16i8, 1 },
2859  };
2860  static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = {
2861  { ISD::CTPOP, MVT::v8i64, 1 },
2862  { ISD::CTPOP, MVT::v16i32, 1 },
2863  { ISD::CTPOP, MVT::v4i64, 1 },
2864  { ISD::CTPOP, MVT::v8i32, 1 },
2865  { ISD::CTPOP, MVT::v2i64, 1 },
2866  { ISD::CTPOP, MVT::v4i32, 1 },
2867  };
2868  static const CostTblEntry AVX512CDCostTbl[] = {
2869  { ISD::CTLZ, MVT::v8i64, 1 },
2870  { ISD::CTLZ, MVT::v16i32, 1 },
2871  { ISD::CTLZ, MVT::v32i16, 8 },
2872  { ISD::CTLZ, MVT::v64i8, 20 },
2873  { ISD::CTLZ, MVT::v4i64, 1 },
2874  { ISD::CTLZ, MVT::v8i32, 1 },
2875  { ISD::CTLZ, MVT::v16i16, 4 },
2876  { ISD::CTLZ, MVT::v32i8, 10 },
2877  { ISD::CTLZ, MVT::v2i64, 1 },
2878  { ISD::CTLZ, MVT::v4i32, 1 },
2879  { ISD::CTLZ, MVT::v8i16, 4 },
2880  { ISD::CTLZ, MVT::v16i8, 4 },
2881  };
2882  static const CostTblEntry AVX512BWCostTbl[] = {
2883  { ISD::ABS, MVT::v32i16, 1 },
2884  { ISD::ABS, MVT::v64i8, 1 },
2885  { ISD::BITREVERSE, MVT::v8i64, 3 },
2886  { ISD::BITREVERSE, MVT::v16i32, 3 },
2887  { ISD::BITREVERSE, MVT::v32i16, 3 },
2888  { ISD::BITREVERSE, MVT::v64i8, 2 },
2889  { ISD::BSWAP, MVT::v8i64, 1 },
2890  { ISD::BSWAP, MVT::v16i32, 1 },
2891  { ISD::BSWAP, MVT::v32i16, 1 },
2892  { ISD::CTLZ, MVT::v8i64, 23 },
2893  { ISD::CTLZ, MVT::v16i32, 22 },
2894  { ISD::CTLZ, MVT::v32i16, 18 },
2895  { ISD::CTLZ, MVT::v64i8, 17 },
2896  { ISD::CTPOP, MVT::v8i64, 7 },
2897  { ISD::CTPOP, MVT::v16i32, 11 },
2898  { ISD::CTPOP, MVT::v32i16, 9 },
2899  { ISD::CTPOP, MVT::v64i8, 6 },
2900  { ISD::CTTZ, MVT::v8i64, 10 },
2901  { ISD::CTTZ, MVT::v16i32, 14 },
2902  { ISD::CTTZ, MVT::v32i16, 12 },
2903  { ISD::CTTZ, MVT::v64i8, 9 },
2904  { ISD::SADDSAT, MVT::v32i16, 1 },
2905  { ISD::SADDSAT, MVT::v64i8, 1 },
2906  { ISD::SMAX, MVT::v32i16, 1 },
2907  { ISD::SMAX, MVT::v64i8, 1 },
2908  { ISD::SMIN, MVT::v32i16, 1 },
2909  { ISD::SMIN, MVT::v64i8, 1 },
2910  { ISD::SSUBSAT, MVT::v32i16, 1 },
2911  { ISD::SSUBSAT, MVT::v64i8, 1 },
2912  { ISD::UADDSAT, MVT::v32i16, 1 },
2913  { ISD::UADDSAT, MVT::v64i8, 1 },
2914  { ISD::UMAX, MVT::v32i16, 1 },
2915  { ISD::UMAX, MVT::v64i8, 1 },
2916  { ISD::UMIN, MVT::v32i16, 1 },
2917  { ISD::UMIN, MVT::v64i8, 1 },
2918  { ISD::USUBSAT, MVT::v32i16, 1 },
2919  { ISD::USUBSAT, MVT::v64i8, 1 },
2920  };
2921  static const CostTblEntry AVX512CostTbl[] = {
2922  { ISD::ABS, MVT::v8i64, 1 },
2923  { ISD::ABS, MVT::v16i32, 1 },
2924  { ISD::ABS, MVT::v32i16, 2 },
2925  { ISD::ABS, MVT::v64i8, 2 },
2926  { ISD::ABS, MVT::v4i64, 1 },
2927  { ISD::ABS, MVT::v2i64, 1 },
2928  { ISD::BITREVERSE, MVT::v8i64, 36 },
2929  { ISD::BITREVERSE, MVT::v16i32, 24 },
2930  { ISD::BITREVERSE, MVT::v32i16, 10 },
2931  { ISD::BITREVERSE, MVT::v64i8, 10 },
2932  { ISD::BSWAP, MVT::v8i64, 4 },
2933  { ISD::BSWAP, MVT::v16i32, 4 },
2934  { ISD::BSWAP, MVT::v32i16, 4 },
2935  { ISD::CTLZ, MVT::v8i64, 29 },
2936  { ISD::CTLZ, MVT::v16i32, 35 },
2937  { ISD::CTLZ, MVT::v32i16, 28 },
2938  { ISD::CTLZ, MVT::v64i8, 18 },
2939  { ISD::CTPOP, MVT::v8i64, 16 },
2940  { ISD::CTPOP, MVT::v16i32, 24 },
2941  { ISD::CTPOP, MVT::v32i16, 18 },
2942  { ISD::CTPOP, MVT::v64i8, 12 },
2943  { ISD::CTTZ, MVT::v8i64, 20 },
2944  { ISD::CTTZ, MVT::v16i32, 28 },
2945  { ISD::CTTZ, MVT::v32i16, 24 },
2946  { ISD::CTTZ, MVT::v64i8, 18 },
2947  { ISD::SMAX, MVT::v8i64, 1 },
2948  { ISD::SMAX, MVT::v16i32, 1 },
2949  { ISD::SMAX, MVT::v32i16, 2 },
2950  { ISD::SMAX, MVT::v64i8, 2 },
2951  { ISD::SMAX, MVT::v4i64, 1 },
2952  { ISD::SMAX, MVT::v2i64, 1 },
2953  { ISD::SMIN, MVT::v8i64, 1 },
2954  { ISD::SMIN, MVT::v16i32, 1 },
2955  { ISD::SMIN, MVT::v32i16, 2 },
2956  { ISD::SMIN, MVT::v64i8, 2 },
2957  { ISD::SMIN, MVT::v4i64, 1 },
2958  { ISD::SMIN, MVT::v2i64, 1 },
2959  { ISD::UMAX, MVT::v8i64, 1 },
2960  { ISD::UMAX, MVT::v16i32, 1 },
2961  { ISD::UMAX, MVT::v32i16, 2 },
2962  { ISD::UMAX, MVT::v64i8, 2 },
2963  { ISD::UMAX, MVT::v4i64, 1 },
2964  { ISD::UMAX, MVT::v2i64, 1 },
2965  { ISD::UMIN, MVT::v8i64, 1 },
2966  { ISD::UMIN, MVT::v16i32, 1 },
2967  { ISD::UMIN, MVT::v32i16, 2 },
2968  { ISD::UMIN, MVT::v64i8, 2 },
2969  { ISD::UMIN, MVT::v4i64, 1 },
2970  { ISD::UMIN, MVT::v2i64, 1 },
2971  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2972  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2973  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2974  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2975  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2976  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2977  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2978  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2979  { ISD::SADDSAT, MVT::v32i16, 2 },
2980  { ISD::SADDSAT, MVT::v64i8, 2 },
2981  { ISD::SSUBSAT, MVT::v32i16, 2 },
2982  { ISD::SSUBSAT, MVT::v64i8, 2 },
2983  { ISD::UADDSAT, MVT::v32i16, 2 },
2984  { ISD::UADDSAT, MVT::v64i8, 2 },
2985  { ISD::USUBSAT, MVT::v32i16, 2 },
2986  { ISD::USUBSAT, MVT::v64i8, 2 },
2987  { ISD::FMAXNUM, MVT::f32, 2 },
2988  { ISD::FMAXNUM, MVT::v4f32, 2 },
2989  { ISD::FMAXNUM, MVT::v8f32, 2 },
2990  { ISD::FMAXNUM, MVT::v16f32, 2 },
2991  { ISD::FMAXNUM, MVT::f64, 2 },
2992  { ISD::FMAXNUM, MVT::v2f64, 2 },
2993  { ISD::FMAXNUM, MVT::v4f64, 2 },
2994  { ISD::FMAXNUM, MVT::v8f64, 2 },
2995  };
2996  static const CostTblEntry XOPCostTbl[] = {
2997  { ISD::BITREVERSE, MVT::v4i64, 4 },
2998  { ISD::BITREVERSE, MVT::v8i32, 4 },
2999  { ISD::BITREVERSE, MVT::v16i16, 4 },
3000  { ISD::BITREVERSE, MVT::v32i8, 4 },
3001  { ISD::BITREVERSE, MVT::v2i64, 1 },
3002  { ISD::BITREVERSE, MVT::v4i32, 1 },
3003  { ISD::BITREVERSE, MVT::v8i16, 1 },
3004  { ISD::BITREVERSE, MVT::v16i8, 1 },
3005  { ISD::BITREVERSE, MVT::i64, 3 },
3006  { ISD::BITREVERSE, MVT::i32, 3 },
3007  { ISD::BITREVERSE, MVT::i16, 3 },
3008  { ISD::BITREVERSE, MVT::i8, 3 }
3009  };
3010  static const CostTblEntry AVX2CostTbl[] = {
3011  { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3012  { ISD::ABS, MVT::v8i32, 1 },
3013  { ISD::ABS, MVT::v16i16, 1 },
3014  { ISD::ABS, MVT::v32i8, 1 },
3015  { ISD::BITREVERSE, MVT::v2i64, 3 },
3016  { ISD::BITREVERSE, MVT::v4i64, 3 },
3017  { ISD::BITREVERSE, MVT::v4i32, 3 },
3018  { ISD::BITREVERSE, MVT::v8i32, 3 },
3019  { ISD::BITREVERSE, MVT::v8i16, 3 },
3020  { ISD::BITREVERSE, MVT::v16i16, 3 },
3021  { ISD::BITREVERSE, MVT::v16i8, 3 },
3022  { ISD::BITREVERSE, MVT::v32i8, 3 },
3023  { ISD::BSWAP, MVT::v4i64, 1 },
3024  { ISD::BSWAP, MVT::v8i32, 1 },
3025  { ISD::BSWAP, MVT::v16i16, 1 },
3026  { ISD::CTLZ, MVT::v2i64, 7 },
3027  { ISD::CTLZ, MVT::v4i64, 7 },
3028  { ISD::CTLZ, MVT::v4i32, 5 },
3029  { ISD::CTLZ, MVT::v8i32, 5 },
3030  { ISD::CTLZ, MVT::v8i16, 4 },
3031  { ISD::CTLZ, MVT::v16i16, 4 },
3032  { ISD::CTLZ, MVT::v16i8, 3 },
3033  { ISD::CTLZ, MVT::v32i8, 3 },
3034  { ISD::CTPOP, MVT::v2i64, 3 },
3035  { ISD::CTPOP, MVT::v4i64, 3 },
3036  { ISD::CTPOP, MVT::v4i32, 7 },
3037  { ISD::CTPOP, MVT::v8i32, 7 },
3038  { ISD::CTPOP, MVT::v8i16, 3 },
3039  { ISD::CTPOP, MVT::v16i16, 3 },
3040  { ISD::CTPOP, MVT::v16i8, 2 },
3041  { ISD::CTPOP, MVT::v32i8, 2 },
3042  { ISD::CTTZ, MVT::v2i64, 4 },
3043  { ISD::CTTZ, MVT::v4i64, 4 },
3044  { ISD::CTTZ, MVT::v4i32, 7 },
3045  { ISD::CTTZ, MVT::v8i32, 7 },
3046  { ISD::CTTZ, MVT::v8i16, 4 },
3047  { ISD::CTTZ, MVT::v16i16, 4 },
3048  { ISD::CTTZ, MVT::v16i8, 3 },
3049  { ISD::CTTZ, MVT::v32i8, 3 },
3050  { ISD::SADDSAT, MVT::v16i16, 1 },
3051  { ISD::SADDSAT, MVT::v32i8, 1 },
3052  { ISD::SMAX, MVT::v8i32, 1 },
3053  { ISD::SMAX, MVT::v16i16, 1 },
3054  { ISD::SMAX, MVT::v32i8, 1 },
3055  { ISD::SMIN, MVT::v8i32, 1 },
3056  { ISD::SMIN, MVT::v16i16, 1 },
3057  { ISD::SMIN, MVT::v32i8, 1 },
3058  { ISD::SSUBSAT, MVT::v16i16, 1 },
3059  { ISD::SSUBSAT, MVT::v32i8, 1 },
3060  { ISD::UADDSAT, MVT::v16i16, 1 },
3061  { ISD::UADDSAT, MVT::v32i8, 1 },
3062  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
3063  { ISD::UMAX, MVT::v8i32, 1 },
3064  { ISD::UMAX, MVT::v16i16, 1 },
3065  { ISD::UMAX, MVT::v32i8, 1 },
3066  { ISD::UMIN, MVT::v8i32, 1 },
3067  { ISD::UMIN, MVT::v16i16, 1 },
3068  { ISD::UMIN, MVT::v32i8, 1 },
3069  { ISD::USUBSAT, MVT::v16i16, 1 },
3070  { ISD::USUBSAT, MVT::v32i8, 1 },
3071  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
3072  { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
3073  { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
3074  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
3075  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
3076  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
3077  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
3078  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
3079  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
3080  };
3081  static const CostTblEntry AVX1CostTbl[] = {
3082  { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3083  { ISD::ABS, MVT::v8i32, 3 },
3084  { ISD::ABS, MVT::v16i16, 3 },
3085  { ISD::ABS, MVT::v32i8, 3 },
3086  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
3087  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
3088  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
3089  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
3090  { ISD::BSWAP, MVT::v4i64, 4 },
3091  { ISD::BSWAP, MVT::v8i32, 4 },
3092  { ISD::BSWAP, MVT::v16i16, 4 },
3093  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
3094  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
3095  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
3096  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
3097  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
3098  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
3099  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
3100  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
3101  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
3102  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
3103  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
3104  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
3105  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3106  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3107  { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3108  { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3109  { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3110  { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3111  { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3112  { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3113  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3114  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3115  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3116  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3117  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
3118  { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3119  { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3120  { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3121  { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3122  { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3123  { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3124  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3125  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3126  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
3127  { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
3128  { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
3129  { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
3130  { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
3131  { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
3132  { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
3133  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
3134  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
3135  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
3136  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
3137  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
3138  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
3139  };
3140  static const CostTblEntry GLMCostTbl[] = {
3141  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
3142  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
3143  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
3144  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
3145  };
3146  static const CostTblEntry SLMCostTbl[] = {
3147  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
3148  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
3149  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
3150  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
3151  };
3152  static const CostTblEntry SSE42CostTbl[] = {
3153  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
3154  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
3155  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
3156  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
3157  };
3158  static const CostTblEntry SSE41CostTbl[] = {
3159  { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
3160  { ISD::SMAX, MVT::v4i32, 1 },
3161  { ISD::SMAX, MVT::v16i8, 1 },
3162  { ISD::SMIN, MVT::v4i32, 1 },
3163  { ISD::SMIN, MVT::v16i8, 1 },
3164  { ISD::UMAX, MVT::v4i32, 1 },
3165  { ISD::UMAX, MVT::v8i16, 1 },
3166  { ISD::UMIN, MVT::v4i32, 1 },
3167  { ISD::UMIN, MVT::v8i16, 1 },
3168  };
3169  static const CostTblEntry SSSE3CostTbl[] = {
3170  { ISD::ABS, MVT::v4i32, 1 },
3171  { ISD::ABS, MVT::v8i16, 1 },
3172  { ISD::ABS, MVT::v16i8, 1 },
3173  { ISD::BITREVERSE, MVT::v2i64, 5 },
3174  { ISD::BITREVERSE, MVT::v4i32, 5 },
3175  { ISD::BITREVERSE, MVT::v8i16, 5 },
3176  { ISD::BITREVERSE, MVT::v16i8, 5 },
3177  { ISD::BSWAP, MVT::v2i64, 1 },
3178  { ISD::BSWAP, MVT::v4i32, 1 },
3179  { ISD::BSWAP, MVT::v8i16, 1 },
3180  { ISD::CTLZ, MVT::v2i64, 23 },
3181  { ISD::CTLZ, MVT::v4i32, 18 },
3182  { ISD::CTLZ, MVT::v8i16, 14 },
3183  { ISD::CTLZ, MVT::v16i8, 9 },
3184  { ISD::CTPOP, MVT::v2i64, 7 },
3185  { ISD::CTPOP, MVT::v4i32, 11 },
3186  { ISD::CTPOP, MVT::v8i16, 9 },
3187  { ISD::CTPOP, MVT::v16i8, 6 },
3188  { ISD::CTTZ, MVT::v2i64, 10 },
3189  { ISD::CTTZ, MVT::v4i32, 14 },
3190  { ISD::CTTZ, MVT::v8i16, 12 },
3191  { ISD::CTTZ, MVT::v16i8, 9 }
3192  };
3193  static const CostTblEntry SSE2CostTbl[] = {
3194  { ISD::ABS, MVT::v2i64, 4 },
3195  { ISD::ABS, MVT::v4i32, 3 },
3196  { ISD::ABS, MVT::v8i16, 2 },
3197  { ISD::ABS, MVT::v16i8, 2 },
3198  { ISD::BITREVERSE, MVT::v2i64, 29 },
3199  { ISD::BITREVERSE, MVT::v4i32, 27 },
3200  { ISD::BITREVERSE, MVT::v8i16, 27 },
3201  { ISD::BITREVERSE, MVT::v16i8, 20 },
3202  { ISD::BSWAP, MVT::v2i64, 7 },
3203  { ISD::BSWAP, MVT::v4i32, 7 },
3204  { ISD::BSWAP, MVT::v8i16, 7 },
3205  { ISD::CTLZ, MVT::v2i64, 25 },
3206  { ISD::CTLZ, MVT::v4i32, 26 },
3207  { ISD::CTLZ, MVT::v8i16, 20 },
3208  { ISD::CTLZ, MVT::v16i8, 17 },
3209  { ISD::CTPOP, MVT::v2i64, 12 },
3210  { ISD::CTPOP, MVT::v4i32, 15 },
3211  { ISD::CTPOP, MVT::v8i16, 13 },
3212  { ISD::CTPOP, MVT::v16i8, 10 },
3213  { ISD::CTTZ, MVT::v2i64, 14 },
3214  { ISD::CTTZ, MVT::v4i32, 18 },
3215  { ISD::CTTZ, MVT::v8i16, 16 },
3216  { ISD::CTTZ, MVT::v16i8, 13 },
3217  { ISD::SADDSAT, MVT::v8i16, 1 },
3218  { ISD::SADDSAT, MVT::v16i8, 1 },
3219  { ISD::SMAX, MVT::v8i16, 1 },
3220  { ISD::SMIN, MVT::v8i16, 1 },
3221  { ISD::SSUBSAT, MVT::v8i16, 1 },
3222  { ISD::SSUBSAT, MVT::v16i8, 1 },
3223  { ISD::UADDSAT, MVT::v8i16, 1 },
3224  { ISD::UADDSAT, MVT::v16i8, 1 },
3225  { ISD::UMAX, MVT::v8i16, 2 },
3226  { ISD::UMAX, MVT::v16i8, 1 },
3227  { ISD::UMIN, MVT::v8i16, 2 },
3228  { ISD::UMIN, MVT::v16i8, 1 },
3229  { ISD::USUBSAT, MVT::v8i16, 1 },
3230  { ISD::USUBSAT, MVT::v16i8, 1 },
3231  { ISD::FMAXNUM, MVT::f64, 4 },
3232  { ISD::FMAXNUM, MVT::v2f64, 4 },
3233  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
3234  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
3235  };
3236  static const CostTblEntry SSE1CostTbl[] = {
3237  { ISD::FMAXNUM, MVT::f32, 4 },
3238  { ISD::FMAXNUM, MVT::v4f32, 4 },
3239  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
3240  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
3241  };
3242  static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
3243  { ISD::CTTZ, MVT::i64, 1 },
3244  };
3245  static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3246  { ISD::CTTZ, MVT::i32, 1 },
3247  { ISD::CTTZ, MVT::i16, 1 },
3248  { ISD::CTTZ, MVT::i8, 1 },
3249  };
3250  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3251  { ISD::CTLZ, MVT::i64, 1 },