//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to a
/// "generic" X86 CPU rather than a specific CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// check Subtarget.hasSSE42() in the lookups below, the cost is based on
/// Nehalem, as that was the first CPU to support that feature level and thus
/// most likely has the worst case cost, although we may discard an outlying
/// worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                    divss     sqrtss     rsqrtss
///   AMD K7           11-16     19         3
///   Piledriver       9-24      13-15      5
///   Jaguar           14        16         2
///   Pentium II,III   18        30         2
///   Nehalem          7-14      7-18       3
///   Haswell          10-13     11         5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
//===----------------------------------------------------------------------===//
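
// Example (a rough sketch, not code from this file): a client holding a
// TargetTransformInfo handle can request each cost kind separately, where
// 'VecTy' stands for whatever IR type is being costed:
//   InstructionCost RThru = TTI.getArithmeticInstrCost(
//       Instruction::FDiv, VecTy, TTI::TCK_RecipThroughput);
//   InstructionCost UOps = TTI.getArithmeticInstrCost(
//       Instruction::FDiv, VecTy, TTI::TCK_SizeAndLatency);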

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  llvm::Optional<unsigned>
  operator[](TTI::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TTI::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TTI::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TTI::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TTI::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return None;
    return Cost;
  }
};
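// Each cost-table entry below stores these four costs in this order, e.g.
//   { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }
// reads as: throughput 1, latency 7, code-size 2, size-and-latency 3.
// Short initializers such as { 6 } leave the remaining kinds at ~0U, so
// operator[] returns None for them and the lookup falls through to a more
// generic cost.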
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
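    // For example (sketch): a v4i32 multiply whose operands are both
    // zero-extended from <4 x i8> is representable in 15 bits, so the check
    // below costs it as a vXi16 (PMADDWD-style) multiply.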
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
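  // e.g. a multiply by 8 is costed below as 'shl X, 3', and a multiply by
  // -8 as the shift plus a negating subtract.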
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }

  // On X86, vector signed division by a constant power-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
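  // e.g. 'sdiv <4 x i32> X, 4' is costed as that sra+srl+add+sra expansion;
  // 'srem' additionally pays for the mul and sub that rebuild X - (X/C)*C.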
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }

  // Vector unsigned division/remainder will be simplified to shifts/masks.
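  // e.g. 'udiv X, 16' is costed as 'lshr X, 4', and 'urem X, 16' as
  // 'and X, 15'.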
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
337 
338  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
339  { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
340  { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
341  { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
342  { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
343  { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
344  { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
345  { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
346  { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
347  { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
348 
349  { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
350  { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
351  { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
352  { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
353  { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
354  { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
355  };
356 
357  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
358  if (const auto *Entry =
359  CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
360  if (auto KindCost = Entry->Cost[CostKind])
361  return LT.first * KindCost.value();
362 
  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,  { 2, 12,  5,  6 } }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  { 2, 12,  5,  6 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v16i16, { 2,  7,  4,  4 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, { 2,  7,  4,  4 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, { 2,  7,  4,  4 } }, // psraw + split.

    { ISD::SHL,  MVT::v8i32,  { 1,  1,  1,  1 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  { 1,  1,  1,  1 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  { 1,  1,  1,  1 } }, // psrad
    { ISD::SHL,  MVT::v16i32, { 1,  1,  1,  1 } }, // pslld
    { ISD::SRL,  MVT::v16i32, { 1,  1,  1,  1 } }, // psrld
    { ISD::SRA,  MVT::v16i32, { 1,  1,  1,  1 } }, // psrad

    { ISD::SRA,  MVT::v2i64,  { 1,  1,  1,  1 } }, // psraq
    { ISD::SHL,  MVT::v4i64,  { 1,  1,  1,  1 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 1,  1,  1,  1 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 1,  1,  1,  1 } }, // psraq
    { ISD::SHL,  MVT::v8i64,  { 1,  1,  1,  1 } }, // psllq
    { ISD::SRL,  MVT::v8i64,  { 1,  1,  1,  1 } }, // psrlq
    { ISD::SRA,  MVT::v8i64,  { 1,  1,  1,  1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 1,  8, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 1,  8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 2,  8, 2, 4 } }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,  { 2,  8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,  { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16,  { 1,  1, 1, 1 } }, // psllw
    { ISD::SRL,  MVT::v8i16,  { 1,  1, 1, 1 } }, // psrlw
    { ISD::SRA,  MVT::v8i16,  { 1,  1, 1, 1 } }, // psraw
    { ISD::SHL,  MVT::v16i16, { 2,  2, 1, 2 } }, // psllw
    { ISD::SRL,  MVT::v16i16, { 2,  2, 1, 2 } }, // psrlw
    { ISD::SRA,  MVT::v16i16, { 2,  2, 1, 2 } }, // psraw

    { ISD::SHL,  MVT::v4i32,  { 1,  1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v4i32,  { 1,  1, 1, 1 } }, // psrld
    { ISD::SRA,  MVT::v4i32,  { 1,  1, 1, 1 } }, // psrad
    { ISD::SHL,  MVT::v8i32,  { 2,  2, 1, 2 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  { 2,  2, 1, 2 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  { 2,  2, 1, 2 } }, // psrad

    { ISD::SHL,  MVT::v2i64,  { 1,  1, 1, 1 } }, // psllq
    { ISD::SRL,  MVT::v2i64,  { 1,  1, 1, 1 } }, // psrlq
    { ISD::SRA,  MVT::v2i64,  { 2,  3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  { 2,  2, 1, 2 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 2,  2, 1, 2 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 4,  4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32,  { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 2,  7,  2,  3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 2,  7,  2,  3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 3,  9,  5,  6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 4,  7,  7,  8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  { 4,  7,  7,  8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  { 7,  7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL,  MVT::v8i16,  { 1,  2,  1,  1 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  { 1,  2,  1,  1 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  { 1,  2,  1,  1 } }, // psraw.
    { ISD::SHL,  MVT::v16i16, { 3,  6,  4,  5 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, { 3,  6,  4,  5 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, { 3,  6,  4,  5 } }, // psraw + split.

    { ISD::SHL,  MVT::v4i32,  { 1,  2,  1,  1 } }, // pslld.
    { ISD::SRL,  MVT::v4i32,  { 1,  2,  1,  1 } }, // psrld.
    { ISD::SRA,  MVT::v4i32,  { 1,  2,  1,  1 } }, // psrad.
    { ISD::SHL,  MVT::v8i32,  { 3,  6,  4,  5 } }, // pslld + split.
    { ISD::SRL,  MVT::v8i32,  { 3,  6,  4,  5 } }, // psrld + split.
    { ISD::SRA,  MVT::v8i32,  { 3,  6,  4,  5 } }, // psrad + split.

    { ISD::SHL,  MVT::v2i64,  { 1,  2,  1,  1 } }, // psllq.
    { ISD::SRL,  MVT::v2i64,  { 1,  2,  1,  1 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64,  { 2,  3,  3,  3 } }, // psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  { 3,  6,  4,  5 } }, // 2 x psllq + split.
    { ISD::SRL,  MVT::v4i64,  { 3,  6,  4,  5 } }, // 2 x psrlq + split.
    { ISD::SRA,  MVT::v4i64,  { 5,  7,  8,  9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32,  { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32,  { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 1,  7, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 1,  7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 3,  9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16,  { 1,  1, 1, 1 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  { 1,  1, 1, 1 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  { 1,  1, 1, 1 } }, // psraw.

    { ISD::SHL,  MVT::v4i32,  { 1,  1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v4i32,  { 1,  1, 1, 1 } }, // psrld.
    { ISD::SRA,  MVT::v4i32,  { 1,  1, 1, 1 } }, // psrad.

    { ISD::SHL,  MVT::v2i64,  { 1,  1, 1, 1 } }, // psllq.
    { ISD::SRL,  MVT::v2i64,  { 1,  1, 1, 1 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64,  { 3,  5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32,  { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,  { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32,  { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,  { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32,  { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32,  { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16,  { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,  { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16,  { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,  { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32,  { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,  { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32,  { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,  { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 3,  5,  5,  7 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 3, 10,  5,  8 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 4, 12,  8, 12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 4,  7,  6,  8 } }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,  { 4,  8,  7,  9 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,  { 5, 10, 10, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v64i8,  { 4,  7,  6,  8 } }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  { 4,  8,  7, 10 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  { 5, 10, 10, 15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i16, { 2,  4,  2,  3 } }, // psllw
    { ISD::SRL,  MVT::v32i16, { 2,  4,  2,  3 } }, // psrlw
    { ISD::SRA,  MVT::v32i16, { 2,  4,  2,  3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL,  MVT::v32i16, { 5, 10, 5, 7 } }, // psllw + split.
    { ISD::SRL,  MVT::v32i16, { 5, 10, 5, 7 } }, // psrlw + split.
    { ISD::SRA,  MVT::v32i16, { 5, 10, 5, 7 } }, // psraw + split.

    { ISD::SHL,  MVT::v16i32, { 2,  4, 2, 3 } }, // pslld
    { ISD::SRL,  MVT::v16i32, { 2,  4, 2, 3 } }, // psrld
    { ISD::SRA,  MVT::v16i32, { 2,  4, 2, 3 } }, // psrad

    { ISD::SRA,  MVT::v2i64,  { 1,  2, 1, 2 } }, // psraq
    { ISD::SHL,  MVT::v4i64,  { 1,  4, 1, 2 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 1,  4, 1, 2 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 1,  4, 1, 2 } }, // psraq
    { ISD::SHL,  MVT::v8i64,  { 1,  4, 1, 2 } }, // psllq
    { ISD::SRL,  MVT::v8i64,  { 1,  4, 1, 2 } }, // psrlq
    { ISD::SRA,  MVT::v8i64,  { 1,  4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i8,  { 3,  5,  5,  7 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 3,  9,  5,  8 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 4,  5,  9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 4,  7,  6,  8 } }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,  { 4,  8,  7,  9 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,  { 6,  9, 11, 16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16,  { 1,  2,  1,  2 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  { 1,  2,  1,  2 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  { 1,  2,  1,  2 } }, // psraw.
    { ISD::SHL,  MVT::v16i16, { 2,  4,  2,  3 } }, // psllw.
    { ISD::SRL,  MVT::v16i16, { 2,  4,  2,  3 } }, // psrlw.
    { ISD::SRA,  MVT::v16i16, { 2,  4,  2,  3 } }, // psraw.

    { ISD::SHL,  MVT::v4i32,  { 1,  2,  1,  2 } }, // pslld
    { ISD::SRL,  MVT::v4i32,  { 1,  2,  1,  2 } }, // psrld
    { ISD::SRA,  MVT::v4i32,  { 1,  2,  1,  2 } }, // psrad
    { ISD::SHL,  MVT::v8i32,  { 2,  4,  2,  3 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  { 2,  4,  2,  3 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  { 2,  4,  2,  3 } }, // psrad

    { ISD::SHL,  MVT::v2i64,  { 1,  2,  1,  2 } }, // psllq
    { ISD::SRL,  MVT::v2i64,  { 1,  2,  1,  2 } }, // psrlq
    { ISD::SRA,  MVT::v2i64,  { 2,  4,  5,  7 } }, // 2 x psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  { 2,  4,  1,  2 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 2,  4,  1,  2 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 4,  6,  5,  9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  {  4,  4,  6,  8 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  {  4,  8,  5,  8 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  {  6,  6,  9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  {  7,  8, 11, 14 } }, // psllw + pand + split.
    { ISD::SRL,  MVT::v32i8,  {  7,  9, 10, 14 } }, // psrlw + pand + split.
    { ISD::SRA,  MVT::v32i8,  { 10, 11, 16, 21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL,  MVT::v8i16,  {  1,  3,  1,  2 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  {  1,  3,  1,  2 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  {  1,  3,  1,  2 } }, // psraw.
    { ISD::SHL,  MVT::v16i16, {  3,  7,  5,  7 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, {  3,  7,  5,  7 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, {  3,  7,  5,  7 } }, // psraw + split.

    { ISD::SHL,  MVT::v4i32,  {  1,  3,  1,  2 } }, // pslld.
    { ISD::SRL,  MVT::v4i32,  {  1,  3,  1,  2 } }, // psrld.
    { ISD::SRA,  MVT::v4i32,  {  1,  3,  1,  2 } }, // psrad.
    { ISD::SHL,  MVT::v8i32,  {  3,  7,  5,  7 } }, // pslld + split.
    { ISD::SRL,  MVT::v8i32,  {  3,  7,  5,  7 } }, // psrld + split.
    { ISD::SRA,  MVT::v8i32,  {  3,  7,  5,  7 } }, // psrad + split.

    { ISD::SHL,  MVT::v2i64,  {  1,  3,  1,  2 } }, // psllq.
    { ISD::SRL,  MVT::v2i64,  {  1,  3,  1,  2 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64,  {  3,  4,  5,  7 } }, // 2 x psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  {  3,  7,  4,  6 } }, // psllq + split.
    { ISD::SRL,  MVT::v4i64,  {  3,  7,  4,  6 } }, // psrlq + split.
    { ISD::SRA,  MVT::v4i64,  {  6,  7, 10, 13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i8,  {  9, 10, 6,  9 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  {  9, 13, 5,  9 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 11, 15, 9, 13 } }, // pcmpgtb sequence.

    { ISD::SHL,  MVT::v8i16,  {  2,  2, 1,  2 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  {  2,  2, 1,  2 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  {  2,  2, 1,  2 } }, // psraw.

    { ISD::SHL,  MVT::v4i32,  {  2,  2, 1,  2 } }, // pslld
    { ISD::SRL,  MVT::v4i32,  {  2,  2, 1,  2 } }, // psrld.
    { ISD::SRA,  MVT::v4i32,  {  2,  2, 1,  2 } }, // psrad.

    { ISD::SHL,  MVT::v2i64,  {  2,  2, 1,  2 } }, // psllq.
    { ISD::SRL,  MVT::v2i64,  {  2,  2, 1,  2 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64,  {  5,  9, 5,  7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL,  MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL,  MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL,  MVT::v32i8,  {  4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v32i8,  {  4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v32i8,  {  6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL,  MVT::v64i8,  {  6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v64i8,  {  7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v64i8,  { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL,  MVT::v8i16,  {  1,  1,  1,  1 } }, // vpsllvw
    { ISD::SRL,  MVT::v8i16,  {  1,  1,  1,  1 } }, // vpsrlvw
    { ISD::SRA,  MVT::v8i16,  {  1,  1,  1,  1 } }, // vpsravw
    { ISD::SHL,  MVT::v16i16, {  1,  1,  1,  1 } }, // vpsllvw
    { ISD::SRL,  MVT::v16i16, {  1,  1,  1,  1 } }, // vpsrlvw
    { ISD::SRA,  MVT::v16i16, {  1,  1,  1,  1 } }, // vpsravw
    { ISD::SHL,  MVT::v32i16, {  1,  1,  1,  1 } }, // vpsllvw
    { ISD::SRL,  MVT::v32i16, {  1,  1,  1,  1 } }, // vpsrlvw
    { ISD::SRA,  MVT::v32i16, {  1,  1,  1,  1 } }, // vpsravw

    { ISD::ADD,  MVT::v64i8,  {  1,  1,  1,  1 } }, // paddb
    { ISD::ADD,  MVT::v32i16, {  1,  1,  1,  1 } }, // paddw

    { ISD::ADD,  MVT::v32i8,  {  1,  1,  1,  1 } }, // paddb
    { ISD::ADD,  MVT::v16i16, {  1,  1,  1,  1 } }, // paddw
    { ISD::ADD,  MVT::v8i32,  {  1,  1,  1,  1 } }, // paddd
    { ISD::ADD,  MVT::v4i64,  {  1,  1,  1,  1 } }, // paddq

    { ISD::SUB,  MVT::v64i8,  {  1,  1,  1,  1 } }, // psubb
    { ISD::SUB,  MVT::v32i16, {  1,  1,  1,  1 } }, // psubw

    { ISD::MUL,  MVT::v32i16, {  1,  5,  1,  1 } }, // pmullw

    { ISD::SUB,  MVT::v32i8,  {  1,  1,  1,  1 } }, // psubb
    { ISD::SUB,  MVT::v16i16, {  1,  1,  1,  1 } }, // psubw
    { ISD::SUB,  MVT::v8i32,  {  1,  1,  1,  1 } }, // psubd
    { ISD::SUB,  MVT::v4i64,  {  1,  1,  1,  1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v64i8,  { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL,  MVT::v64i8,  { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA,  MVT::v64i8,  { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL,  MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA,  MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL,  MVT::v4i32,  {  1,  1,  1,  1 } },
    { ISD::SRL,  MVT::v4i32,  {  1,  1,  1,  1 } },
    { ISD::SRA,  MVT::v4i32,  {  1,  1,  1,  1 } },
    { ISD::SHL,  MVT::v8i32,  {  1,  1,  1,  1 } },
    { ISD::SRL,  MVT::v8i32,  {  1,  1,  1,  1 } },
    { ISD::SRA,  MVT::v8i32,  {  1,  1,  1,  1 } },
    { ISD::SHL,  MVT::v16i32, {  1,  1,  1,  1 } },
    { ISD::SRL,  MVT::v16i32, {  1,  1,  1,  1 } },
    { ISD::SRA,  MVT::v16i32, {  1,  1,  1,  1 } },

    { ISD::SHL,  MVT::v2i64,  {  1,  1,  1,  1 } },
    { ISD::SRL,  MVT::v2i64,  {  1,  1,  1,  1 } },
    { ISD::SRA,  MVT::v2i64,  {  1,  1,  1,  1 } },
    { ISD::SHL,  MVT::v4i64,  {  1,  1,  1,  1 } },
    { ISD::SRL,  MVT::v4i64,  {  1,  1,  1,  1 } },
    { ISD::SRA,  MVT::v4i64,  {  1,  1,  1,  1 } },
    { ISD::SHL,  MVT::v8i64,  {  1,  1,  1,  1 } },
    { ISD::SRL,  MVT::v8i64,  {  1,  1,  1,  1 } },
    { ISD::SRA,  MVT::v8i64,  {  1,  1,  1,  1 } },

    { ISD::ADD,  MVT::v64i8,  {  3,  7,  5,  5 } }, // 2*paddb + split
    { ISD::ADD,  MVT::v32i16, {  3,  7,  5,  5 } }, // 2*paddw + split

    { ISD::SUB,  MVT::v64i8,  {  3,  7,  5,  5 } }, // 2*psubb + split
    { ISD::SUB,  MVT::v32i16, {  3,  7,  5,  5 } }, // 2*psubw + split

    { ISD::AND,  MVT::v32i8,  {  1,  1,  1,  1 } },
    { ISD::AND,  MVT::v16i16, {  1,  1,  1,  1 } },
    { ISD::AND,  MVT::v8i32,  {  1,  1,  1,  1 } },
    { ISD::AND,  MVT::v4i64,  {  1,  1,  1,  1 } },

    { ISD::OR,   MVT::v32i8,  {  1,  1,  1,  1 } },
    { ISD::OR,   MVT::v16i16, {  1,  1,  1,  1 } },
    { ISD::OR,   MVT::v8i32,  {  1,  1,  1,  1 } },
    { ISD::OR,   MVT::v4i64,  {  1,  1,  1,  1 } },

    { ISD::XOR,  MVT::v32i8,  {  1,  1,  1,  1 } },
    { ISD::XOR,  MVT::v16i16, {  1,  1,  1,  1 } },
    { ISD::XOR,  MVT::v8i32,  {  1,  1,  1,  1 } },
    { ISD::XOR,  MVT::v4i64,  {  1,  1,  1,  1 } },

    { ISD::MUL,  MVT::v16i32, {  1, 10,  1,  2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,  {  1, 10,  1,  2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,  {  1, 10,  1,  2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,  {  6,  9,  8,  8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::i64,    {  1 } },             // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v8f64,  {  1,  1,  1,  2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64,    {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64,    {  4, 14,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  {  4, 14,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  {  8, 14,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  { 16, 23,  1,  3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, {  1,  1,  1,  2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    {  1,  4,  1,  1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    {  3, 11,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  {  3, 11,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  {  5, 11,  1,  1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18,  1,  3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
    // custom, in order to detect the cases where the shift amount is a scalar.
    { ISD::SHL,  MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,  MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,  MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,  MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,  MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,  MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,  MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,  MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL,  MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,  MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,  MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL,  MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA,  MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL,  MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL,  MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA,  MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL,  MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL,  MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA,  MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL,  MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL,  MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA,  MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,  MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL,  MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA,  MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL,  MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL,  MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA,  MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL,  MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL,  MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA,  MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL,  MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL,  MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA,  MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();
  }


  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
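    // e.g. 'shl <4 x i32> X, <i32 1, i32 2, i32 3, i32 4>' is costed as
    // 'mul X, <i32 2, i32 4, i32 8, i32 16>' via the MUL rows of the
    // tables below.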
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,  MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32,   {  1,  4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2);
    // slm muldq throughput is 2 and addq throughput is 4,
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64, {  4,  2, 1, 2 } },
    { ISD::SUB,  MVT::v2i64, {  4,  2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,  {  6, 21, 11, 16 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,  {  6, 23, 11, 22 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,  {  5, 18,  5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,  {  6, 27, 12, 18 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,  {  8, 30, 12, 24 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,  {  5, 11,  5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,  { 17, 17, 24, 30 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  { 18, 20, 24, 43 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,  {  5, 11,  5, 10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,  {  4,  5,  5,  5 } }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  {  8,  8,  5,  9 } }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,  {  1,  1,  1,  2 } }, // psubb
    { ISD::ADD,  MVT::v32i8,  {  1,  1,  1,  2 } }, // paddb
    { ISD::SUB,  MVT::v16i16, {  1,  1,  1,  2 } }, // psubw
    { ISD::ADD,  MVT::v16i16, {  1,  1,  1,  2 } }, // paddw
    { ISD::SUB,  MVT::v8i32,  {  1,  1,  1,  2 } }, // psubd
    { ISD::ADD,  MVT::v8i32,  {  1,  1,  1,  2 } }, // paddd
    { ISD::SUB,  MVT::v4i64,  {  1,  1,  1,  2 } }, // psubq
    { ISD::ADD,  MVT::v4i64,  {  1,  1,  1,  2 } }, // paddq

    { ISD::MUL,  MVT::v16i16, {  2,  5,  1,  1 } }, // pmullw
    { ISD::MUL,  MVT::v8i32,  {  4, 10,  1,  2 } }, // pmulld
    { ISD::MUL,  MVT::v4i32,  {  2, 10,  1,  2 } }, // pmulld
    { ISD::MUL,  MVT::v4i64,  {  6, 10,  8, 13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::v2i64,  {  6, 10,  8,  8 } }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v4f64,  {  1,  1,  1,  2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32,  {  1,  1,  1,  2 } }, // vxorps

    { ISD::FADD, MVT::f64,    {  1,  4,  1,  1 } }, // vaddsd
    { ISD::FADD, MVT::f32,    {  1,  4,  1,  1 } }, // vaddss
    { ISD::FADD, MVT::v2f64,  {  1,  4,  1,  1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32,  {  1,  4,  1,  1 } }, // vaddps
    { ISD::FADD, MVT::v4f64,  {  1,  4,  1,  2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32,  {  1,  4,  1,  2 } }, // vaddps

    { ISD::FSUB, MVT::f64,    {  1,  4,  1,  1 } }, // vsubsd
    { ISD::FSUB, MVT::f32,    {  1,  4,  1,  1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64,  {  1,  4,  1,  1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32,  {  1,  4,  1,  1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64,  {  1,  4,  1,  2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32,  {  1,  4,  1,  2 } }, // vsubps

    { ISD::FMUL, MVT::f64,    {  1,  5,  1,  1 } }, // vmulsd
    { ISD::FMUL, MVT::f32,    {  1,  5,  1,  1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64,  {  1,  5,  1,  1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32,  {  1,  5,  1,  1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64,  {  1,  5,  1,  2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32,  {  1,  5,  1,  2 } }, // vmulps

    { ISD::FDIV, MVT::f32,    {  7, 13,  1,  1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32,  {  7, 13,  1,  1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32,  { 14, 21,  1,  3 } }, // vdivps
    { ISD::FDIV, MVT::f64,    { 14, 20,  1,  1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64,  { 14, 20,  1,  1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64,  { 28, 35,  1,  3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * KindCost.value();

1129  static const CostKindTblEntry AVX1CostTable[] = {
1130  // We don't have to scalarize unsupported ops. We can issue two half-sized
1131  // operations and we only need to extract the upper YMM half.
1132  // Two ops + 1 extract + 1 insert = 4.
1133  { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1134  { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1135  { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1136  { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1137 
1138  { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1139  { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1140  { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1141  { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1142 
1143  { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1144  { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1145  { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1146  { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1147 
1148  { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1149  { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1150  { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1151  { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1152 
1153  { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1154  { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1155  { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1156  { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1157  { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1158  { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1159  { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1160  { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1161  { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1162  { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1163 
1164  { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1165  { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1166  { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1167  { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1168  { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1169  { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1170  { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1171  { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1172 
1173  { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1174  { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1175  { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1176  { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1177  { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1178  { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1179  { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1180  { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1181 
1182  { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1183  { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1184  { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1185  { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1186  { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1187  { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1188  { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1189  { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1190 
1191  { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1192  { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1193 
1194  { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1195  { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1196  { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1197  { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1198  { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1199  { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1200 
1201  { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1202  { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1203  { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1204  { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1205  { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1206  { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1207 
1208  { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1209  { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1210  { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1211  { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1212  { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1213  { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1214 
1215  { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1216  { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1217  { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1218  { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1219  { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1220  { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1221  };
1222 
1223  if (ST->hasAVX())
1224  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1225  if (auto KindCost = Entry->Cost[CostKind])
1226  return LT.first * KindCost.value();
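  // Worked example: a legal v8i32 SRL matches the AVX1 table directly
  // (LT.first == 1), while an illegal v16i32 SRL legalizes to two v8i32
  // halves, so LT.first == 2 doubles whichever of the four per-kind table
  // values CostKind selects.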
1227 
1228  static const CostKindTblEntry SSE42CostTable[] = {
1229  { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1230  { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1231  { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1232  { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1233 
1234  { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1235  { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1236  { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1237  { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1238 
1239  { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1240  { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1241  { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1242  { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1243 
1244  { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1245  { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1246  { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1247  { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1248 
1249  { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1250  };
1251 
1252  if (ST->hasSSE42())
1253  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1254  if (auto KindCost = Entry->Cost[CostKind])
1255  return LT.first * KindCost.value();
1256 
1257 
1258  static const CostKindTblEntry SSE41CostTable[] = {
1259  { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1260  { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1261  { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1262 
1263  { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1264  { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1265  { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1266  { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1267 
1268  { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1269  { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1270  { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1271  { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1272 
1273  { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1274  };
1275 
1276  if (ST->hasSSE41())
1277  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1278  if (auto KindCost = Entry->Cost[CostKind])
1279  return LT.first * KindCost.value();
1280 
1281  static const CostKindTblEntry SSE2CostTable[] = {
1282  // We don't correctly identify costs of casts because they are marked as
1283  // custom.
1284  { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1285  { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1286  { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1287  { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1288 
1289  { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1290  { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1291  { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1292  { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1293 
1294  { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1295  { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1296  { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1297  { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1298 
1299  { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1300  { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1301  { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1302  { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1303 
1304  { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1305  { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1306  { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1307  { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1308 
1309  { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1310  { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1311  { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1312  { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1313 
1314  { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1315  { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1316 
1317  { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1318  { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1319  { ISD::MUL, MVT::v2i64, { 8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1320 
1321  { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1322  { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1323  { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1324  { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1325 
1326  { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1327  { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1328  { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1329  { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1330 
1331  { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1332  { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1333  { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1334 
1335  { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1336  { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1337  { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1338 
1339  { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1340  { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1341  };
1342 
1343  if (ST->hasSSE2())
1344  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1345  if (auto KindCost = Entry->Cost[CostKind])
1346  return LT.first * KindCost.value();
1347 
1348  static const CostKindTblEntry SSE1CostTable[] = {
1349  { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1350  { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1351 
1352  { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1353  { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1354 
1355  { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1356  { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1357 
1358  { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1359  { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1360 
1361  { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1362  { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1363  };
1364 
1365  if (ST->hasSSE1())
1366  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1367  if (auto KindCost = Entry->Cost[CostKind])
1368  return LT.first * KindCost.value();
1369 
1370  static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1371  { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1372  { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1373  { ISD::MUL, MVT::i64, { 2 } }, // Nehalem from http://www.agner.org/
1374  };
1375 
1376  if (ST->is64Bit())
1377  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1378  if (auto KindCost = Entry->Cost[CostKind])
1379  return LT.first * KindCost.value();
1380 
1381  static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1382  { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1383  { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1384  { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1385 
1386  { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1387  { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1388  { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1389 
1390  { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1391  { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1392  { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1393  { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1394  { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1395  };
1396 
1397  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1398  if (auto KindCost = Entry->Cost[CostKind])
1399  return LT.first * KindCost.value();
1400 
1401  // It is not a good idea to vectorize division. We have to scalarize it and
1402  // in the process we will often end up having to spill regular
1403  // registers. The overhead of division is going to dominate most kernels
1404  // anyway, so try hard to prevent vectorization of division - it is
1405  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1406  // to hide "20 cycles" for each lane.
1407  if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1408  (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1409  ISD == ISD::UREM)) {
1410  InstructionCost ScalarCost =
1411  getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1412  Op1Info.getNoProps(), Op2Info.getNoProps());
1413  return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1414  }
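  // e.g. for a legal <4 x i32> sdiv this yields 20 * 1 * 4 * ScalarCost,
  // which is large enough to steer the vectorizers away from vector division.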
1415 
1416  // Handle some basic single instruction code size cases.
1417  if (CostKind == TTI::TCK_CodeSize) {
1418  switch (ISD) {
1419  case ISD::FADD:
1420  case ISD::FSUB:
1421  case ISD::FMUL:
1422  case ISD::FDIV:
1423  case ISD::FNEG:
1424  case ISD::AND:
1425  case ISD::OR:
1426  case ISD::XOR:
1427  return LT.first;
1429  }
1430  }
1431 
1432  // Fall back to the default implementation.
1433  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1434  Args, CxtI);
1435 }
1436 
1437 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1438  VectorType *BaseTp,
1439  ArrayRef<int> Mask,
1440  TTI::TargetCostKind CostKind,
1441  int Index, VectorType *SubTp,
1442  ArrayRef<const Value *> Args) {
1443  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1444  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1445  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1446 
1447  Kind = improveShuffleKindFromMask(Kind, Mask);
1448 
1449  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1450  if (Kind == TTI::SK_Transpose)
1451  Kind = TTI::SK_PermuteTwoSrc;
1452 
1453  // For Broadcasts we are splatting the first element from the first input
1454  // register, so we only need to reference that input and all the output
1455  // registers are the same.
1456  if (Kind == TTI::SK_Broadcast)
1457  LT.first = 1;
1458 
1459  // Subvector extractions are free if they start at the beginning of a
1460  // vector and cheap if the subvectors are aligned.
1461  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1462  int NumElts = LT.second.getVectorNumElements();
1463  if ((Index % NumElts) == 0)
1464  return 0;
1465  std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1466  if (SubLT.second.isVector()) {
1467  int NumSubElts = SubLT.second.getVectorNumElements();
1468  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1469  return SubLT.first;
1470  // Handle some cases for widening legalization. For now we only handle
1471  // cases where the original subvector was naturally aligned and evenly
1472  // fit in its legalized subvector type.
1473  // FIXME: Remove some of the alignment restrictions.
1474  // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1475  // vectors.
1476  int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1477  if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1478  (NumSubElts % OrigSubElts) == 0 &&
1479  LT.second.getVectorElementType() ==
1480  SubLT.second.getVectorElementType() &&
1481  LT.second.getVectorElementType().getSizeInBits() ==
1482  BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1483  assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1484  "Unexpected number of elements!");
1485  auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1486  LT.second.getVectorNumElements());
1487  auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1488  SubLT.second.getVectorNumElements());
1489  int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1490  InstructionCost ExtractCost =
1491  getShuffleCost(TTI::SK_ExtractSubvector, VecTy, None, CostKind,
1492  ExtractIndex, SubTy);
1493 
1494  // If the original size is 32 bits or more, we can use pshufd. Otherwise
1495  // if we have SSSE3 we can use pshufb.
1496  if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1497  return ExtractCost + 1; // pshufd or pshufb
1498 
1499  assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1500  "Unexpected vector size");
1501 
1502  return ExtractCost + 2; // worst case pshufhw + pshufd
1503  }
1504  }
1505  }
1506 
1507  // Subvector insertions are cheap if the subvectors are aligned.
1508  // Note that in general, the insertion starting at the beginning of a vector
1509  // isn't free, because we need to preserve the rest of the wide vector.
1510  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1511  int NumElts = LT.second.getVectorNumElements();
1512  std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1513  if (SubLT.second.isVector()) {
1514  int NumSubElts = SubLT.second.getVectorNumElements();
1515  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1516  return SubLT.first;
1517  }
1518 
1519  // If the insertion isn't aligned, treat it like a 2-op shuffle.
1520  Kind = TTI::SK_PermuteTwoSrc;
1521  }
1522 
1523  // Handle some common (illegal) sub-vector types as they are often very cheap
1524  // to shuffle even on targets without PSHUFB.
1525  EVT VT = TLI->getValueType(DL, BaseTp);
1526  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1527  !ST->hasSSSE3()) {
1528  static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1529  {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1530  {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1531  {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1532  {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1533  {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1534 
1535  {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1536  {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1537  {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1538  {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1539 
1540  {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1541  {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1542  {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1543  {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1544 
1545  {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1546  {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1547  {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1548  {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1549  {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1550 
1551  {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1552  {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1553  {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1554  {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1555  {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1556  };
1557 
1558  if (ST->hasSSE2())
1559  if (const auto *Entry =
1560  CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1561  return Entry->Cost;
1562  }
1563 
1564  // We are going to permute multiple sources and the result will be in
1565  // multiple destinations. We provide an accurate cost only for splits where
1566  // the element type remains the same.
1567  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1568  MVT LegalVT = LT.second;
1569  if (LegalVT.isVector() &&
1570  LegalVT.getVectorElementType().getSizeInBits() ==
1571  BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1572  LegalVT.getVectorNumElements() <
1573  cast<FixedVectorType>(BaseTp)->getNumElements()) {
1574 
1575  unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1576  unsigned LegalVTSize = LegalVT.getStoreSize();
1577  // Number of source vectors after legalization:
1578  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1579  // Number of destination vectors after legalization:
1580  InstructionCost NumOfDests = LT.first;
1581 
1582  auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1583  LegalVT.getVectorNumElements());
1584 
1585  if (!Mask.empty() && NumOfDests.isValid()) {
1586  // Try to perform better estimation of the permutation.
1587  // 1. Split the source/destination vectors into real registers.
1588  // 2. Do the mask analysis to identify which real registers are
1589  // permuted. If more than 1 source registers are used for the
1590  // destination register building, the cost for this destination register
1591  // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1592  // source register is used, build mask and calculate the cost as a cost
1593  // of PermuteSingleSrc.
1594  // Also, for the single register permute we try to identify if the
1595  // destination register is just a copy of the source register or the
1596  // copy of the previous destination register (the cost is
1597  // TTI::TCC_Basic). If the source register is just reused, the cost for
1598  // this operation is 0.
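  // E.g. a destination register assembled from 3 source registers is costed
  // as 2 * Cost_PermuteTwoSrc, while one built from a single (non-identity)
  // source costs one PermuteSingleSrc shuffle.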
1599  unsigned E = *NumOfDests.getValue();
1600  unsigned NormalizedVF =
1601  LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1602  unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1603  unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1604  SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
1605  copy(Mask, NormalizedMask.begin());
1606  unsigned PrevSrcReg = 0;
1607  ArrayRef<int> PrevRegMask;
1608  InstructionCost Cost = 0;
1609  processShuffleMasks(
1610  NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1611  [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1612  &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1613  if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
1614  // Check if the previous register can be just copied to the next
1615  // one.
1616  if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1617  PrevRegMask != RegMask)
1618  Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1619  RegMask, CostKind, 0, nullptr);
1620  else
1621  // Just a copy of previous destination register.
1622  Cost += TTI::TCC_Basic;
1623  return;
1624  }
1625  if (SrcReg != DestReg &&
1626  any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
1627  // Just a copy of the source register.
1628  Cost += TTI::TCC_Basic;
1629  }
1630  PrevSrcReg = SrcReg;
1631  PrevRegMask = RegMask;
1632  },
1633  [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1634  unsigned /*Unused*/,
1635  unsigned /*Unused*/) {
1636  Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1637  CostKind, 0, nullptr);
1638  });
1639  return Cost;
1640  }
1641 
1642  InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1643  return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1644  None, CostKind, 0, nullptr);
1645  }
1646 
1647  return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1648  }
1649 
1650  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1651  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1652  // We assume that source and destination have the same vector type.
1653  InstructionCost NumOfDests = LT.first;
1654  InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1655  LT.first = NumOfDests * NumOfShufflesPerDest;
1656  }
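  // e.g. with LT.first == 2 this models 2 destinations * (2 * 2 - 1) = 6
  // two-input shuffles, which then scale the per-type table costs below.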
1657 
1658  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1659  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1660  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1661 
1662  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1663  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1664 
1665  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1666  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1667  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1668  };
1669 
1670  if (ST->hasVBMI())
1671  if (const auto *Entry =
1672  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1673  return LT.first * Entry->Cost;
1674 
1675  static const CostTblEntry AVX512BWShuffleTbl[] = {
1676  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1677  {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1678  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1679 
1680  {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1681  {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1682  {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1683  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1684 
1685  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1686  {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1687  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1688  {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1689  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1690 
1691  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1692  {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1693  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1694  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1695  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1696 
1697  {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1698  {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1699 
1700  {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1701  {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1702  {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1703  };
1704 
1705  if (ST->hasBWI())
1706  if (const auto *Entry =
1707  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1708  return LT.first * Entry->Cost;
1709 
1710  static const CostTblEntry AVX512ShuffleTbl[] = {
1711  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1712  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1713  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1714  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1715  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1716  {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1717  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1718 
1719  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1720  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1721  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1722  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1723  {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
1724  {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca
1725  {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
1726 
1727  {TTI::SK_Splice, MVT::v8f64, 1}, // vpalignd
1728  {TTI::SK_Splice, MVT::v4f64, 1}, // vpalignd
1729  {TTI::SK_Splice, MVT::v16f32, 1}, // vpalignd
1730  {TTI::SK_Splice, MVT::v8f32, 1}, // vpalignd
1731  {TTI::SK_Splice, MVT::v8i64, 1}, // vpalignd
1732  {TTI::SK_Splice, MVT::v4i64, 1}, // vpalignd
1733  {TTI::SK_Splice, MVT::v16i32, 1}, // vpalignd
1734  {TTI::SK_Splice, MVT::v8i32, 1}, // vpalignd
1735  {TTI::SK_Splice, MVT::v32i16, 4}, // split + palignr
1736  {TTI::SK_Splice, MVT::v32f16, 4}, // split + palignr
1737  {TTI::SK_Splice, MVT::v64i8, 4}, // split + palignr
1738 
1739  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1740  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1741  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1742  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1743  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1744  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1745  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1746  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1747  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1748  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1749  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1750  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1751  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1752 
1753  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1754  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1755  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1756  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1757  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1758  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1759  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1760  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1761  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1762  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1763  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1764  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
1765 
1766  // FIXME: This just applies the type legalization cost rules above
1767  // assuming these completely split.
1768  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
1769  {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14},
1770  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
1771  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
1772  {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42},
1773  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
1774 
1775  {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
1776  {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq
1777  {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
1778  {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
1779  {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
1780  {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
1781  {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
1782  };
1783 
1784  if (ST->hasAVX512())
1785  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1786  return LT.first * Entry->Cost;
1787 
1788  static const CostTblEntry AVX2ShuffleTbl[] = {
1789  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1790  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1791  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1792  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1793  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1794  {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1795  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1796 
1797  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1798  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1799  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1800  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1801  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1802  {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1803  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1804 
1805  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1806  {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1807  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1808 
1809  {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1810  {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1811  {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1812  {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1813  {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1814 
1815  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1816  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1817  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1818  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1819  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1820  // + vpblendvb
1821  {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1822  // + vpblendvb
1823  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1824  // + vpblendvb
1825 
1826  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1827  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1828  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1829  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1830  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1831  // + vpblendvb
1832  {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1833  // + vpblendvb
1834  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1835  // + vpblendvb
1836  };
1837 
1838  if (ST->hasAVX2())
1839  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1840  return LT.first * Entry->Cost;
1841 
1842  static const CostTblEntry XOPShuffleTbl[] = {
1843  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1844  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1845  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1846  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1847  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1848  // + vinsertf128
1849  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1850  // + vinsertf128
1851 
1852  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1853  // + vinsertf128
1854  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1855  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1856  // + vinsertf128
1857  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1858  };
1859 
1860  if (ST->hasXOP())
1861  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1862  return LT.first * Entry->Cost;
1863 
1864  static const CostTblEntry AVX1ShuffleTbl[] = {
1865  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1866  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1867  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1868  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1869  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1870  {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1871  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1872 
1873  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1874  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1875  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1876  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1877  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1878  // + vinsertf128
1879  {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1880  // + vinsertf128
1881  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1882  // + vinsertf128
1883 
1884  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1885  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1886  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1887  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1888  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1889  {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1890  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1891 
1892  {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1893  {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1894  {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1895  {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1896  {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1897  {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1898  {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1899 
1900  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1901  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1902  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1903  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1904  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1905  // + 2*por + vinsertf128
1906  {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1907  // + 2*por + vinsertf128
1908  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1909  // + 2*por + vinsertf128
1910 
1911  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1912  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1913  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1914  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1915  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1916  // + 4*por + vinsertf128
1917  {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1918  // + 4*por + vinsertf128
1919  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1920  // + 4*por + vinsertf128
1921  };
1922 
1923  if (ST->hasAVX())
1924  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1925  return LT.first * Entry->Cost;
1926 
1927  static const CostTblEntry SSE41ShuffleTbl[] = {
1928  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1929  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1930  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1931  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1932  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1933  {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1934  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1935  };
1936 
1937  if (ST->hasSSE41())
1938  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1939  return LT.first * Entry->Cost;
1940 
1941  static const CostTblEntry SSSE3ShuffleTbl[] = {
1942  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1943  {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1944  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1945 
1946  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1947  {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
1948  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1949 
1950  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1951  {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
1952  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1953 
1954  {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
1955  {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
1956  {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
1957  {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
1958  {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
1959 
1960  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1961  {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
1962  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1963 
1964  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1965  {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
1966  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1967  };
1968 
1969  if (ST->hasSSSE3())
1970  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1971  return LT.first * Entry->Cost;
1972 
1973  static const CostTblEntry SSE2ShuffleTbl[] = {
1974  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1975  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1976  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1977  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1978  {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
1979  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1980 
1981  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1982  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1983  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1984  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1985  {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
1986  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1987  // + 2*pshufd + 2*unpck + packus
1988 
1989  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1990  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1991  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1992  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1993  {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
1994  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1995 
1996  {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
1997  {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
1998  {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
1999  {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por
2000  {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por
2001  {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por
2002 
2003  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2004  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2005  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2006  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2007  // + pshufd/unpck
2008  {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2009  // + pshufd/unpck
2010  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2011  // + 2*pshufd + 2*unpck + 2*packus
2012 
2013  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2014  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2015  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2016  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2017  { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2018  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2019  };
2020 
2021  static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2022  {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2023  };
2024 
2025  if (ST->hasSSE2()) {
2026  bool IsLoad =
2027  llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2028  if (ST->hasSSE3() && IsLoad)
2029  if (const auto *Entry =
2030  CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2031  assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2032  LT.second.getVectorElementCount()) &&
2033  "Table entry missing from isLegalBroadcastLoad()");
2034  return LT.first * Entry->Cost;
2035  }
2036 
2037  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2038  return LT.first * Entry->Cost;
2039  }
2040 
2041  static const CostTblEntry SSE1ShuffleTbl[] = {
2042  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2043  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2044  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2045  { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2046  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2047  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2048  };
2049 
2050  if (ST->hasSSE1())
2051  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2052  return LT.first * Entry->Cost;
2053 
2054  return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2055 }
2056 
2057 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2058  Type *Src,
2059  TTI::CastContextHint CCH,
2060  TTI::TargetCostKind CostKind,
2061  const Instruction *I) {
2062  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2063  assert(ISD && "Invalid opcode");
2064 
2065  // TODO: Allow non-throughput costs that aren't binary.
2066  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2067  if (CostKind != TTI::TCK_RecipThroughput)
2068  return Cost == 0 ? 0 : 1;
2069  return Cost;
2070  };
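  // e.g. under TCK_CodeSize a multi-instruction conversion from the tables
  // below is reported as 1, while a genuinely free conversion stays 0.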
2071 
2072  // The cost tables include both specific, custom (non-legal) src/dst type
2073  // conversions and generic, legalized types. We test for custom conversions
2074  // first, before falling back to legalization.
2075  // FIXME: Need a better design of the cost table to handle non-simple types
2076  // with potentially massive combinations (elem_num x src_type x dst_type).
2077  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2080 
2081  // Mask sign extend has an instruction.
2099 
2100  // Mask zero extend is a sext + shift.
2118 
2136 
2138  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
2139  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
2140  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
2141  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
2142  };
2143 
2144  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2145  // Mask sign extend has an instruction.
2154 
2155  // Mask zero extend is a sext + shift.
2164 
2173 
2176 
2179 
2182 
2185  };
2186 
2187  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2188  // 256-bit wide vectors.
2189 
2190  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2194 
2195  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2196  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2197  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2198  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
2199  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2200  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2201  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2202  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2203  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
2204  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
2205  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
2206  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
2207  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
2208  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
2209  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
2210  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
2211  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
2212  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
2213  { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
2214  { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
2215  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
2216  { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
2217  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
2218  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
2219  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
2220  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
2221  { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
2222  { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
2223  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
2224  { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
2225  { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
2226  { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
2227  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
2228  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2229 
2230  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
2233 
2234  // Sign extend is zmm vpternlogd+vptruncdb.
2235  // Zero extend is zmm broadcast load+vptruncdw.
2244 
2245  // Sign extend is zmm vpternlogd+vptruncdw.
2246  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2255 
2256  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
2257  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
2258  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
2259  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
2260  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
2261  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
2262  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
2263  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
2264  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
2265  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
2266 
2267  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
2268  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2269  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
2270  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
2271 
2282 
2283  { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2284  { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2285 
2294 
2305 
2317 
2324  };
2325 
2326  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2327  // Mask sign extend has an instruction.
2345 
2346  // Mask zero extend is a sext + shift.
2364 
2382 
2384  };
2385 
2386  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
2387  // Mask sign extend has an instruction.
2396 
2397  // Mask zero extend is a sext + shift.
2406 
2415 
2420 
2425 
2430 
2435  };
2436 
2437  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2438  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2439  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2440  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2441  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2442  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2443  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2444  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2445  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2446  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2447  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2448  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2449  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2450  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2451  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2452  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2453  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2454  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2455 
2456  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2457  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2466 
2467  // sign extend is vpcmpeq+maskedmove+vpmovdw
2468  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2477 
2478  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2479  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2480  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2481  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2482  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2483  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2484  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2485  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2486  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2487  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2488 
2501 
2506 
2520 
2524 
2532  };
2533 
2534  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2541 
2556 
2558 
2571 
2574 
2579 
2588 
2596 
2607  };
2608 
2609  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2616 
2629 
2635 
2638  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2642  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2644 
2657 
2675 
2687 
2701 
2704  };
2705 
2706  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2719 
2720  // These truncates end up widening elements.
2721  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2722  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2723  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2724 
2728 
2740 
2755 
2766 
2777  };
2778 
2779  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2780  // These are somewhat magic numbers justified by comparing the
2781  // output of llvm-mca for our various supported scheduler models
2782  // and basing them on the worst case scenario.
2795 
2809 
2820 
2824  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2831 
2844 
2845  // These truncates are really widening elements.
2846  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2847  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2848  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2849  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2850  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2851  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2852 
2853  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2855  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2861  { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2862  { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2863  { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2864  };
2865 
2866  // Attempt to map directly to (simple) MVT types to let us match custom entries.
2867  EVT SrcTy = TLI->getValueType(DL, Src);
2868  EVT DstTy = TLI->getValueType(DL, Dst);
2869 
2870  // The function getSimpleVT only handles simple value types.
2871  if (SrcTy.isSimple() && DstTy.isSimple()) {
2872  MVT SimpleSrcTy = SrcTy.getSimpleVT();
2873  MVT SimpleDstTy = DstTy.getSimpleVT();
2874 
2875  if (ST->useAVX512Regs()) {
2876  if (ST->hasBWI())
2877  if (const auto *Entry = ConvertCostTableLookup(
2878  AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2879  return AdjustCost(Entry->Cost);
2880 
2881  if (ST->hasDQI())
2882  if (const auto *Entry = ConvertCostTableLookup(
2883  AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2884  return AdjustCost(Entry->Cost);
2885 
2886  if (ST->hasAVX512())
2887  if (const auto *Entry = ConvertCostTableLookup(
2888  AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2889  return AdjustCost(Entry->Cost);
2890  }
2891 
2892  if (ST->hasBWI())
2893  if (const auto *Entry = ConvertCostTableLookup(
2894  AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2895  return AdjustCost(Entry->Cost);
2896 
2897  if (ST->hasDQI())
2898  if (const auto *Entry = ConvertCostTableLookup(
2899  AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2900  return AdjustCost(Entry->Cost);
2901 
2902  if (ST->hasAVX512())
2903  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2904  SimpleDstTy, SimpleSrcTy))
2905  return AdjustCost(Entry->Cost);
2906 
2907  if (ST->hasAVX2()) {
2908  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2909  SimpleDstTy, SimpleSrcTy))
2910  return AdjustCost(Entry->Cost);
2911  }
2912 
2913  if (ST->hasAVX()) {
2914  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2915  SimpleDstTy, SimpleSrcTy))
2916  return AdjustCost(Entry->Cost);
2917  }
2918 
2919  if (ST->hasSSE41()) {
2920  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2921  SimpleDstTy, SimpleSrcTy))
2922  return AdjustCost(Entry->Cost);
2923  }
2924 
2925  if (ST->hasSSE2()) {
2926  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2927  SimpleDstTy, SimpleSrcTy))
2928  return AdjustCost(Entry->Cost);
2929  }
2930  }
2931 
2932  // Fall back to legalized types.
2933  std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2934  std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2935 
2936  // If we're truncating to the same legalized type - just assume it's free.
2937  if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2938  return TTI::TCC_Free;
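  // (For instance, trunc i128 -> i64 on 64-bit targets legalizes both sides
  // to i64, so the truncate is modeled as free.)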
2939 
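  // Look up the legalized types in the same tables, scaling by the larger of
  // the two split counts: e.g. an SSE2 conversion from v16i32 (4 registers
  // after splitting) is charged 4x the per-register table cost.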
2940  if (ST->useAVX512Regs()) {
2941  if (ST->hasBWI())
2942  if (const auto *Entry = ConvertCostTableLookup(
2943  AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2944  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2945 
2946  if (ST->hasDQI())
2947  if (const auto *Entry = ConvertCostTableLookup(
2948  AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2949  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2950 
2951  if (ST->hasAVX512())
2952  if (const auto *Entry = ConvertCostTableLookup(
2953  AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2954  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2955  }
2956 
2957  if (ST->hasBWI())
2958  if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2959  LTDest.second, LTSrc.second))
2960  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2961 
2962  if (ST->hasDQI())
2963  if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2964  LTDest.second, LTSrc.second))
2965  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2966 
2967  if (ST->hasAVX512())
2968  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2969  LTDest.second, LTSrc.second))
2970  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2971 
2972  if (ST->hasAVX2())
2973  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2974  LTDest.second, LTSrc.second))
2975  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2976 
2977  if (ST->hasAVX())
2978  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2979  LTDest.second, LTSrc.second))
2980  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2981 
2982  if (ST->hasSSE41())
2983  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2984  LTDest.second, LTSrc.second))
2985  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2986 
2987  if (ST->hasSSE2())
2988  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2989  LTDest.second, LTSrc.second))
2990  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2991 
2992  // Fallback: for i8/i16 sitofp/uitofp cases we need to extend to i32
2993  // first and then use sitofp.
2994  if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
2995  1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
2996  Type *ExtSrc = Src->getWithNewBitWidth(32);
2997  unsigned ExtOpc =
2998  (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
2999 
3000  // For scalar loads the extend would be free.
3001  InstructionCost ExtCost = 0;
3002  if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3003  ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3004 
3005  return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3006  TTI::CastContextHint::None, CostKind);
3007  }
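  // e.g. sitofp i16 -> float is costed as (sext i16 -> i32) + (sitofp i32 ->
  // float), with the extend treated as free when it folds into a scalar load.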
3008 
3009  // Fallback: for fptosi/fptoui to i8/i16 we compute fptosi to i32 first
3010  // and then truncate the result.
3011  if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3012  1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3013  Type *TruncDst = Dst->getWithNewBitWidth(32);
3014  return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3015  getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3016  TTI::CastContextHint::None, CostKind);
3017  }
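  // e.g. fptoui float -> i8 is costed as (fptosi float -> i32) +
  // (trunc i32 -> i8).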
3018 
3019  return AdjustCost(
3020  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3021 }
3022 
3023 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3024  Type *CondTy,
3025  CmpInst::Predicate VecPred,
3026  TTI::TargetCostKind CostKind,
3027  const Instruction *I) {
3028  // Early out if this type isn't scalar/vector integer/float.
3029  if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3030  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3031  I);
3032 
3033  // Legalize the type.
3034  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3035 
3036  MVT MTy = LT.second;
3037 
3038  int ISD = TLI->InstructionOpcodeToISD(Opcode);
3039  assert(ISD && "Invalid opcode");
3040 
3041  InstructionCost ExtraCost = 0;
3042  if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3043  // Some vector comparison predicates cost extra instructions.
3044  // TODO: Should we invert this and assume worst case cmp costs
3045  // and reduce for particular predicates?
3046  if (MTy.isVector() &&
3047  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3048  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3049  ST->hasBWI())) {
3050  // Fall back to I's predicate if a specific one wasn't specified.
3051  CmpInst::Predicate Pred = VecPred;
3052  if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3053  Pred == CmpInst::BAD_FCMP_PREDICATE))
3054  Pred = cast<CmpInst>(I)->getPredicate();
3055 
3056  switch (Pred) {
3057  case CmpInst::Predicate::ICMP_NE:
3058  // xor(cmpeq(x,y),-1)
3059  ExtraCost = 1;
3060  break;
3061  case CmpInst::Predicate::ICMP_SGE:
3062  case CmpInst::Predicate::ICMP_SLE:
3063  // xor(cmpgt(x,y),-1)
3064  ExtraCost = 1;
3065  break;
3066  case CmpInst::Predicate::ICMP_ULT:
3067  case CmpInst::Predicate::ICMP_UGT:
3068  // cmpgt(xor(x,signbit),xor(y,signbit))
3069  // xor(cmpeq(pmaxu(x,y),x),-1)
3070  ExtraCost = 2;
3071  break;
3072  case CmpInst::Predicate::ICMP_ULE:
3073  case CmpInst::Predicate::ICMP_UGE:
3074  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3075  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3076  // cmpeq(psubus(x,y),0)
3077  // cmpeq(pminu(x,y),x)
3078  ExtraCost = 1;
3079  } else {
3080  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3081  ExtraCost = 3;
3082  }
3083  break;
3084  case CmpInst::Predicate::FCMP_ONE:
3085  case CmpInst::Predicate::FCMP_UEQ:
3086  // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3087  // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
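  // Illustrative SSE lowering (assumed) for v4f32: cmpunordps + cmpeqps +
  // orps, which is what the three-part cost sum below models.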
3088  if (CondTy && !ST->hasAVX())
3089  return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3090  CmpInst::Predicate::FCMP_UNO, CostKind) +
3091  getCmpSelInstrCost(Opcode, ValTy, CondTy,
3092  CmpInst::Predicate::FCMP_OEQ, CostKind) +
3093  getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3094 
3095  break;
3096  case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3097  case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3098  // Assume worst case scenario and add the maximum extra cost.
3099  ExtraCost = 3;
3100  break;
3101  default:
3102  break;
3103  }
3104  }
3105  }
3106 
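  // Each cost tuple below is indexed by cost kind, in the order
  // { RecipThroughput, Latency, CodeSize, SizeAndLatency } (matching the
  // fields of CostKindCosts).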
3107  static const CostKindTblEntry SLMCostTbl[] = {
3108  // slm pcmpeq/pcmpgt throughput is 2
3109  { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3110  // slm pblendvb/blendvpd/blendvps throughput is 4
3111  { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3112  { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3113  { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3114  { ISD::SELECT, MVT::v4i32, { 4, 4, 1, 3 } }, // pblendvb
3115  { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3116  { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3117  };
3118 
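  // With AVX512BW, i8/i16 vector compares are assumed to map directly onto
  // the vpcmpb/vpcmpw mask compares, hence the unit costs below.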
3119  static const CostKindTblEntry AVX512BWCostTbl[] = {
3120  { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3121  { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3122  { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3123  { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3124 
3125  { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3126  { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3127  };
3128 
3129  static const CostKindTblEntry AVX512CostTbl[] = {
3130  { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3131  { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3132  { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3133  { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3134 
3135  { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3136  { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3137  { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3138  { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3139  { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3140  { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3141  { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
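  // Without BWI the 512-bit i8/i16 compares above are assumed to be split
  // into 256-bit halves and repacked, hence the higher costs.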
3142 
3143  { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3144  { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3145  { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3146  { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3147  { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3148  { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3149  { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3150  { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3151  { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3152  { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3153  { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3154  { ISD::SELECT, MVT::v8f32, { 1, 1, 1, 1 } },
3155  { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3156  { ISD::SELECT, MVT::f32, { 1, 1, 1, 1 } },
3157 
3158  { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3159  { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3160  { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3161  { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3162  { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3163  { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3164  };
3165 
3166  static const CostKindTblEntry AVX2CostTbl[] = {
3167  { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3168  { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3169  { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3170  { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3171  { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3172  { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3173 
3174  { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3175  { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3176  { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3177  { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3178 
3179  { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3180  { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3181  { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3182  { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3183  { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3184  { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3185  };
3186 
3187  static const CostKindTblEntry XOPCostTbl[] = {
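  // XOP's vpcom* compares encode the predicate directly, which is why the
  // hasXOP() check earlier skips the ExtraCost expansions (editorial note).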
3188  { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3189  { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3190  };
3191 
3192  static const CostKindTblEntry AVX1CostTbl[] = {
3193  { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3194  { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3195  { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3196  { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3197  { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3198  { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3199 
3200  // AVX1 does not support 8-wide integer compare.
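  // The 256-bit integer compares below are assumed to split into two
  // 128-bit pcmp ops plus extract/insert to recombine the halves.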
3201  { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3202  { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3203  { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3204  { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3205 
3206  { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3207  { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3208  { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3209  { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3210  { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3211  { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3212  };
3213 
3214  static const CostKindTblEntry SSE42CostTbl[] = {
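  // SSE4.2 adds pcmpgtq, giving a cheap native v2i64 compare (hence this
  // single entry).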
3215  { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3216  };
3217 
3218  static const CostKindTblEntry SSE41CostTbl[] = {