//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to some
/// "generic" X86 CPU rather than to a concrete CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// do Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem,
/// as that was the first CPU to support that feature level and thus most
/// likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target-dependent costs (latency):
///                     divss    sqrtss    rsqrtss
///   AMD K7            11-16    19        3
///   Piledriver        9-24     13-15     5
///   Jaguar            14       16        2
///   Pentium II,III    18       30        2
///   Nehalem           7-14     7-18      3
///   Haswell           10-13    11        5
///
/// TODO: Develop and implement the target-dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as
/// throughput, code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024;  //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}
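
// (Illustrative note: on x86-64 with AVX-512 the vector class above reports
// 32 registers, xmm/ymm/zmm0-31; without AVX-512 it is 16, and in 32-bit
// mode only 8.)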

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

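  // Illustrative note: LT.first is the number of legal-type pieces the IR
  // type splits into and LT.second is the legal machine type, so every table
  // hit below is scaled by the split count. E.g. on an SSE2-only target,
  // v8f32 legalizes to LT = {2, v4f32} and a v4f32 entry is charged twice.
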
  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV,  MVT::f32,   18 }, // divss
    { ISD::FDIV,  MVT::v4f32, 35 }, // divps
    { ISD::FDIV,  MVT::f64,   33 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16,  2 }, // pmullw
    { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
    { ISD::FMUL, MVT::f64,    2 }, // mulsd
    { ISD::FMUL, MVT::v2f64,  4 }, // mulpd
    { ISD::FMUL, MVT::v4f32,  2 }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64,  2 }, // addpd
    { ISD::FSUB, MVT::v2f64,  2 }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64,  4 },
    { ISD::SUB,  MVT::v2i64,  4 },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
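
    // Illustrative example: for "mul <4 x i32> %a, %b" where both operands
    // are zero-extended from i8, OpMinSize is 8 and SignedMode is false, so
    // the multiply is costed as a 16-bit pmullw sequence (LT.first * 3)
    // rather than via the generic v4i32 pmulld entry above.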

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
       ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
      // On X86, vector signed division by a constant power-of-two is
      // normally expanded to the sequence SRA + SRL + ADD + SRA.
      // The OperandValue properties may not be the same as that of the
      // previous operation; conservatively assume OP_None.
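      // For example (illustrative), a scalar i32 sdiv by 4 expands to:
      //   t    = ashr X, 31   ; all-ones if X is negative
      //   bias = lshr t, 30   ; 0 or 3, rounds negative values toward zero
      //   sum  = add  X, bias
      //   q    = ashr sum, 2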
      InstructionCost Cost =
          2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);

      if (ISD == ISD::SREM) {
        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                       Op2Info);
        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                       Op2Info);
      }

      return Cost;
    }

    // Vector unsigned division/remainder will be simplified to shifts/masks.
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    else // UREM
      return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }
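
  // (Illustrative: "X udiv 8" lowers to "lshr X, 3" and "X urem 8" lowers to
  // "and X, 7", which is why the shift/mask costs above are used directly.)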

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,  1 },
    { ISD::SRA,  MVT::v4i64,  1 },
    { ISD::SRA,  MVT::v8i64,  1 },

    { ISD::SHL,  MVT::v64i8,  4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8, 2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8, 2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64, 4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,    2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,    2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,    4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,  4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL, MVT::v8i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v8i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,  1 }, // vpsravw

    { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
    { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, 1 }, // vpsravw

    { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
    { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
    { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, 1 },
    { ISD::MUL, MVT::v4i64, 1 },
    { ISD::MUL, MVT::v8i64, 1 }
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.

    { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
    { ISD::MUL, MVT::v32i8,  4 }, // extend/pmullw/trunc sequence.
    { ISD::MUL, MVT::v16i8,  4 }, // extend/pmullw/trunc sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v16i32,  1 },
    { ISD::SRL,  MVT::v16i32,  1 },
    { ISD::SRA,  MVT::v16i32,  1 },

    { ISD::SHL,  MVT::v8i64,   1 },
    { ISD::SRL,  MVT::v8i64,   1 },

    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::MUL,  MVT::v64i8,  26 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v32i8,  13 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,   5 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,   8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/

    { ISD::FADD, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them
    // custom, in order to detect the cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, 1 },
    { ISD::SRL, MVT::v4i32, 1 },
    { ISD::SRA, MVT::v4i32, 1 },
    { ISD::SHL, MVT::v8i32, 1 },
    { ISD::SRL, MVT::v8i32, 1 },
    { ISD::SRA, MVT::v8i32, 1 },
    { ISD::SHL, MVT::v2i64, 1 },
    { ISD::SRL, MVT::v2i64, 1 },
    { ISD::SHL, MVT::v4i64, 1 },
    { ISD::SRL, MVT::v4i64, 1 },
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128-bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,    1 },
    { ISD::SRL, MVT::v16i8,    2 },
    { ISD::SRA, MVT::v16i8,    2 },
    { ISD::SHL, MVT::v8i16,    1 },
    { ISD::SRL, MVT::v8i16,    2 },
    { ISD::SRA, MVT::v8i16,    2 },
    { ISD::SHL, MVT::v4i32,    1 },
    { ISD::SRL, MVT::v4i32,    2 },
    { ISD::SRA, MVT::v4i32,    2 },
    { ISD::SHL, MVT::v2i64,    1 },
    { ISD::SRL, MVT::v2i64,    2 },
    { ISD::SRA, MVT::v2i64,    2 },
    // 256-bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  2+2 },
    { ISD::SRL, MVT::v32i8,  4+2 },
    { ISD::SRA, MVT::v32i8,  4+2 },
    { ISD::SHL, MVT::v16i16, 2+2 },
    { ISD::SRL, MVT::v16i16, 4+2 },
    { ISD::SRA, MVT::v16i16, 4+2 },
    { ISD::SHL, MVT::v8i32,  2+2 },
    { ISD::SRL, MVT::v8i32,  4+2 },
    { ISD::SRA, MVT::v8i32,  4+2 },
    { ISD::SHL, MVT::v4i64,  2+2 },
    { ISD::SRL, MVT::v4i64,  4+2 },
    { ISD::SRA, MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL, MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL, MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL, MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL, MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA, MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA, MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA, MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
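
  // (Illustrative: "shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>" is
  // equivalent to "mul %x, <i32 2, i32 4, i32 8, i32 16>", so the MUL table
  // entries below apply.)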

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v64i8,  22 }, // 2*vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  22 }, // 2*vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,  24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  48 }, // 2*vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   4 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   4 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v32i8,  17 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,   7 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,   8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,  4 },
    { ISD::MUL,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v32i8,   4 },
    { ISD::ADD,  MVT::v32i8,   4 },
    { ISD::SUB,  MVT::v16i16,  4 },
    { ISD::ADD,  MVT::v16i16,  4 },
    { ISD::SUB,  MVT::v8i32,   4 },
    { ISD::ADD,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v4i64,   4 },
    { ISD::ADD,  MVT::v4i64,   4 },

    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(3) and adds(2).
    // Because we believe v4i64 to be a legal type, we must also include the
    // extract+insert in the cost table. Therefore, the cost here is 18
    // instead of 8.
    { ISD::MUL,  MVT::v4i64,  18 },

    { ISD::MUL,  MVT::v32i8,  26 }, // extend/pmullw/trunc sequence.

    { ISD::FDIV, MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8,      11 }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32,       4 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split

    { ISD::SRL, MVT::v16i8,      12 }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32,      11 }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8,      24 }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32,      12 }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.

    { ISD::MUL, MVT::v4i32,       2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,     26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,     32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,    2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,      4 }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64,  2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRL,  MVT::v16i8,     26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,     32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,     16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,      4 }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64,  2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRA,  MVT::v16i8,     54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,     32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,     16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,     12 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.

    { ISD::MUL,  MVT::v16i8,     12 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v8i16,      1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,      6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,      8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,       23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,     39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,       38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,     69 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,        2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,        2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,        2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,        2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::ADD,  MVT::i8,     1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i16,    1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i32,    1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB,  MVT::i8,     1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i16,    1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i32,    1 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular registers.
  // The overhead of division is going to dominate most kernels anyway, so
  // try hard to prevent vectorization of division - it is generally a bad
  // idea. Assume somewhat arbitrarily that we have to be able to hide "20
  // cycles" for each lane.
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    InstructionCost ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }
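
  // (Illustrative: a v4i32 sdiv that legalizes to a single v4i32 piece is
  // costed as 20 * 1 * 4 * ScalarCost - deliberately huge so the vectorizer
  // backs off.)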

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *BaseTp,
                                           ArrayRef<int> Mask, int Index,
                                           VectorType *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so we only need to reference that input; all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        InstructionCost ExtractCost = getShuffleCost(
            TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);

        // If the original size is 32-bits or more, we can use pshufd. Otherwise
        // if we have SSSE3 we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }
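
  // (Illustrative: extracting a v2i64 subvector at index 2 from a legal
  // v4i64 is not free above since Index % NumElts != 0, but it is aligned on
  // the subvector boundary, so it returns SubLT.first.)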

  // Subvector insertions are cheap if the subvectors are aligned.
  // Note that in general, the insertion starting at the beginning of a vector
  // isn't free, because we need to preserve the rest of the wide vector.
  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      !ST->hasSSSE3()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
      {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck

      {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
      {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck

      {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck

      {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
    };

    if (ST->hasSSE2())
      if (const auto *Entry =
              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
        return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. Provide an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                            None, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  }
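
  // (Illustrative: a v16i32 single-source permute on AVX2 legalizes to two
  // v8i32 registers, so NumOfSrcs == NumOfDests == 2 and the cost is
  // (2 - 1) * 2 == 2 two-source v8i32 shuffles.)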

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    int NumOfDests = LT.first;
    int NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }
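
  // (Illustrative: if LT.first == 2, the two-source shuffle is charged as
  // NumOfDests(2) * NumOfShufflesPerDest(3) == 6 legal-type shuffles.)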

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
      {TTI::SK_Reverse,          MVT::v64i8, 1}, // vpermb
      {TTI::SK_Reverse,          MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteTwoSrc,    MVT::v64i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc,    MVT::v32i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc,    MVT::v16i8, 2}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
      {TTI::SK_Broadcast,        MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast,        MVT::v64i8,  1}, // vpbroadcastb

      {TTI::SK_Reverse,          MVT::v32i16, 2}, // vpermw
      {TTI::SK_Reverse,          MVT::v16i16, 2}, // vpermw
      {TTI::SK_Reverse,          MVT::v64i8,  2}, // pshufb + vshufi64x2

      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v64i8,  8}, // extend to v32i16

      {TTI::SK_PermuteTwoSrc,    MVT::v32i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc,    MVT::v16i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc,    MVT::v8i16,  2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19}, // 6 * v32i8 + 1

      {TTI::SK_Select,           MVT::v32i16, 1}, // vblendmw
      {TTI::SK_Select,           MVT::v64i8,  1}, // vblendmb
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8f64,  1}, // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
      {TTI::SK_Broadcast, MVT::v8i64,  1}, // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8,  1}, // vpbroadcastb

      {TTI::SK_Reverse, MVT::v8f64,  1}, // vpermpd
      {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
      {TTI::SK_Reverse, MVT::v8i64,  1}, // vpermq
      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd

      {TTI::SK_PermuteSingleSrc, MVT::v8f64,  1}, // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v4f64,  1}, // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v2f64,  1}, // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8f32,  1}, // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4f32,  1}, // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8i64,  1}, // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v4i64,  1}, // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v2i64,  1}, // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32,  1}, // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v4i32,  1}, // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i8,  1}, // pshufb

      {TTI::SK_PermuteTwoSrc, MVT::v8f64,  1}, // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v8i64,  1}, // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v4f64,  1}, // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32,  1}, // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64,  1}, // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v8i32,  1}, // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v2f64,  1}, // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v4f32,  1}, // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v2i64,  1}, // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v4i32,  1}, // vpermt2d

      // FIXME: This just applies the type legalization cost rules above
      // assuming these completely split.
      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
      {TTI::SK_PermuteSingleSrc, MVT::v64i8,  14},
      {TTI::SK_PermuteTwoSrc,    MVT::v32i16, 42},
      {TTI::SK_PermuteTwoSrc,    MVT::v64i8,  42},

      {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
      {TTI::SK_Select, MVT::v64i8,  1}, // vpternlogq
      {TTI::SK_Select, MVT::v8f64,  1}, // vblendmpd
      {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
      {TTI::SK_Select, MVT::v8i64,  1}, // vblendmq
      {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f64,  1}, // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v8f32,  1}, // vbroadcastps
      {TTI::SK_Broadcast, MVT::v4i64,  1}, // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v8i32,  1}, // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v32i8,  1}, // vpbroadcastb

      {TTI::SK_Reverse, MVT::v4f64,  1}, // vpermpd
      {TTI::SK_Reverse, MVT::v8f32,  1}, // vpermps
      {TTI::SK_Reverse, MVT::v4i64,  1}, // vpermq
      {TTI::SK_Reverse, MVT::v8i32,  1}, // vpermd
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
      {TTI::SK_Reverse, MVT::v32i8,  2}, // vperm2i128 + pshufb

      {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
      {TTI::SK_Select, MVT::v32i8,  1}, // vpblendvb

      {TTI::SK_PermuteSingleSrc, MVT::v4f64,  1}, // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32,  1}, // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4i64,  1}, // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v8i32,  1}, // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8,  4}, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb

      {TTI::SK_PermuteTwoSrc, MVT::v4f64,  3}, // 2*vpermpd + vblendpd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32,  3}, // 2*vpermps + vblendps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64,  3}, // 2*vpermq + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v8i32,  3}, // 2*vpermd + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
      {TTI::SK_PermuteTwoSrc, MVT::v32i8,  7}, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry XOPShuffleTbl[] = {
      {TTI::SK_PermuteSingleSrc, MVT::v4f64,  2}, // vperm2f128 + vpermil2pd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32,  2}, // vperm2f128 + vpermil2ps
      {TTI::SK_PermuteSingleSrc, MVT::v4i64,  2}, // vperm2f128 + vpermil2pd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32,  2}, // vperm2f128 + vpermil2ps
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
                                                  // + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v32i8,  4}, // vextractf128 + 2*vpperm
                                                  // + vinsertf128

      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
                                               // + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v8i16,  1}, // vpperm
      {TTI::SK_PermuteTwoSrc, MVT::v32i8,  9}, // 2*vextractf128 + 6*vpperm
                                               // + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v16i8,  1}, // vpperm
  };

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f64,  2}, // vperm2f128 + vpermilpd
      {TTI::SK_Broadcast, MVT::v8f32,  2}, // vperm2f128 + vpermilps
      {TTI::SK_Broadcast, MVT::v4i64,  2}, // vperm2f128 + vpermilpd
      {TTI::SK_Broadcast, MVT::v8i32,  2}, // vperm2f128 + vpermilps
      {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
      {TTI::SK_Broadcast, MVT::v32i8,  2}, // vpshufb + vinsertf128

      {TTI::SK_Reverse, MVT::v4f64,  2}, // vperm2f128 + vpermilpd
      {TTI::SK_Reverse, MVT::v8f32,  2}, // vperm2f128 + vpermilps
      {TTI::SK_Reverse, MVT::v4i64,  2}, // vperm2f128 + vpermilpd
      {TTI::SK_Reverse, MVT::v8i32,  2}, // vperm2f128 + vpermilps
      {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
                                         // + vinsertf128
      {TTI::SK_Reverse, MVT::v32i8,  4}, // vextractf128 + 2*pshufb
                                         // + vinsertf128

      {TTI::SK_Select, MVT::v4i64,  1}, // vblendpd
      {TTI::SK_Select, MVT::v4f64,  1}, // vblendpd
      {TTI::SK_Select, MVT::v8i32,  1}, // vblendps
      {TTI::SK_Select, MVT::v8f32,  1}, // vblendps
      {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
      {TTI::SK_Select, MVT::v32i8,  3}, // vpand + vpandn + vpor

      {TTI::SK_PermuteSingleSrc, MVT::v4f64,  2}, // vperm2f128 + vshufpd
      {TTI::SK_PermuteSingleSrc, MVT::v4i64,  2}, // vperm2f128 + vshufpd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32,  4}, // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteSingleSrc, MVT::v8i32,  4}, // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v32i8,  8}, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128

      {TTI::SK_PermuteTwoSrc, MVT::v4f64,   3}, // 2*vperm2f128 + vshufpd
      {TTI::SK_PermuteTwoSrc, MVT::v4i64,   3}, // 2*vperm2f128 + vshufpd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32,   4}, // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteTwoSrc, MVT::v8i32,   4}, // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v32i8,  15}, // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41ShuffleTbl[] = {
      {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
      {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
      {TTI::SK_Select, MVT::v4f32, 1}, // blendps
      {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
      {TTI::SK_Select, MVT::v16i8, 1}  // pblendvb
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSSE3ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
      {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb

      {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
      {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb

      {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
      {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por

      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb

      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
  };

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
      {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
      {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
      {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
      {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd

      {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
      {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
      {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
      {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
      {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
                                        // + 2*pshufd + 2*unpck + packus

      {TTI::SK_Select, MVT::v2i64, 1}, // movsd
      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
      {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
      {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
      {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por

      {TTI::SK_PermuteSingleSrc, MVT::v2f64,  1}, // shufpd
      {TTI::SK_PermuteSingleSrc, MVT::v2i64,  1}, // pshufd
      {TTI::SK_PermuteSingleSrc, MVT::v4i32,  1}, // pshufd
      {TTI::SK_PermuteSingleSrc, MVT::v8i16,  5}, // 2*pshuflw + 2*pshufhw
                                                  // + pshufd/unpck
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw
                                                  // + 2*pshufd + 2*unpck + 2*packus

      {TTI::SK_PermuteTwoSrc, MVT::v2f64,  1}, // shufpd
      {TTI::SK_PermuteTwoSrc, MVT::v2i64,  1}, // shufpd
      {TTI::SK_PermuteTwoSrc, MVT::v4i32,  2}, // 2*{unpck,movsd,pshufd}
      {TTI::SK_PermuteTwoSrc, MVT::v8i16,  8}, // blend+permute
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
1398 
1399  static const CostTblEntry SSE1ShuffleTbl[] = {
1400  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1401  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1402  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1403  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1404  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1405  };
1406 
1407  if (ST->hasSSE1())
1408  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1409  return LT.first * Entry->Cost;
1410 
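 // For illustration: the tables above are consulted from the most to the
 // least capable feature level, and a hit is scaled by the legalization
 // split factor. On an SSE2-only target, for example, a SK_PermuteTwoSrc
 // shuffle of v8i32 legalizes to two v4i32 halves (LT.first == 2), each
 // priced at 2 in SSE2ShuffleTbl, so the reported cost is 2 * 2 = 4.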
1411  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1412 }
1413 
1414 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1415  Type *Src,
1416  TTI::CastContextHint CCH,
1417  TTI::TargetCostKind CostKind,
1418  const Instruction *I) {
1419  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1420  assert(ISD && "Invalid opcode");
1421 
1422  // TODO: Allow non-throughput costs that aren't binary.
1423  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1424  if (CostKind != TTI::TCK_RecipThroughput)
1425  return Cost == 0 ? 0 : 1;
1426  return Cost;
1427  };
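 // For illustration: under a non-throughput cost kind such as
 // TTI::TCK_CodeSize, AdjustCost collapses every lookup to a binary answer;
 // a free conversion stays at 0 and anything else is reported as 1.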
1428 
1429  // FIXME: Need a better design of the cost table to handle non-simple types
1430  // and the potentially massive number of combinations (elem_num x src_type x dst_type).
1431 
1432  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1435 
1436  // Mask sign extend has an instruction.
1448 
1449  // Mask zero extend is a sext + shift.
1461 
1463  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1464  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
1465  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
1466  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
1467  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
1468  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
1469  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
1470  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
1471  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
1472  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
1475  };
1476 
1477  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1480 
1483 
1486 
1489  };
1490 
1491  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1492  // 256-bit wide vectors.
1493 
1494  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1498 
1499  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1500  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1501  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1502  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1503  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1504  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1505  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1506  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1507  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1508  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1509  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1510  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1511  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1512  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1513  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1519  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1520  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1521 
1522  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1524 
1525  // Sign extend is zmm vpternlogd+vptruncdb.
1526  // Zero extend is zmm broadcast load+vptruncdw.
1535 
1536  // Sign extend is zmm vpternlogd+vptruncdw.
1537  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1546 
1547  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1548  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1549  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1550  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1551  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1552  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1553  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1554  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1555  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1556  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1557 
1558  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1559  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1560  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1561  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1562 
1573 
1574  { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1575  { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1576 
1585 
1596 
1601 
1608  };
1609 
1610  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1611  // Mask sign extend has an instruction.
1621 
1622  // Mask zero extend is a sext + shift.
1632 
1634  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
1635  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
1636  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
1637  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
1638  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
1639  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
1640  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
1641  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
1642  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
1643  };
1644 
1645  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1650 
1655 
1660 
1665  };
1666 
1667  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1668  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1669  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1670  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1671  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1672  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1673  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1674  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1675  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1676  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1677  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1678  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1679  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1680  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1681  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1682 
1683  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1684  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1693 
1694  // sign extend is vpcmpeq+maskedmove+vpmovdw
1695  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1704 
1705  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1706  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1707  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1708  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1709  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1710  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1711  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1712  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1713  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1714  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1715 
1730 
1733 
1736 
1739 
1745  };
1746 
1747  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1768 
1771 
1776 
1779 
1781  };
1782 
1783  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1802 
1808 
1819 
1832 
1848  // The generic code to compute the scalar overhead is currently broken.
1849  // Work around this limitation by estimating the scalarization overhead
1850  // here. We have roughly 10 instructions per scalar element.
1851  // Multiply that by the vector width.
1852  // FIXME: remove this when PR19268 is fixed.
1855 
1860 
1865  // This node is expanded into scalarized operations, but BasicTTI is overly
1866  // optimistic when estimating its cost. It computes 3 per element (one
1867  // vector-extract, one scalar conversion and one vector-insert). The
1868  // problem is that the inserts form a read-modify-write chain, so latency
1869  // should be factored in too. Inflate the cost per element by 1.
1872 
1875  };
1876 
1877  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1884 
1903 
1904  // These truncates end up widening elements.
1905  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
1906  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
1907  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
1908 
1917  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
1918 
1921 
1924 
1928  };
1929 
1930  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1931  // These are somewhat magic numbers justified by looking at the output of
1932  // Intel's IACA, running some kernels and making sure that, when we take
1933  // legalization into account, the throughput will be overestimated.
1935  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1943 
1944  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1952 
1959 
1961 
1964 
1973 
1998 
1999  // These truncates are really widening elements.
2000  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2001  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2002  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2003  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2004  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2005  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2006 
2007  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
2008  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
2009  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2011  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
2019  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2020  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2021  { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
2022  };
2023 
2024  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2025  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
2026 
2027  if (ST->hasSSE2() && !ST->hasAVX()) {
2028  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2029  LTDest.second, LTSrc.second))
2030  return AdjustCost(LTSrc.first * Entry->Cost);
2031  }
2032 
2033  EVT SrcTy = TLI->getValueType(DL, Src);
2034  EVT DstTy = TLI->getValueType(DL, Dst);
2035 
2036  // The function getSimpleVT only handles simple value types.
2037  if (!SrcTy.isSimple() || !DstTy.isSimple())
2038  return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
2039 
2040  MVT SimpleSrcTy = SrcTy.getSimpleVT();
2041  MVT SimpleDstTy = DstTy.getSimpleVT();
2042 
2043  if (ST->useAVX512Regs()) {
2044  if (ST->hasBWI())
2045  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
2046  SimpleDstTy, SimpleSrcTy))
2047  return AdjustCost(Entry->Cost);
2048 
2049  if (ST->hasDQI())
2050  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
2051  SimpleDstTy, SimpleSrcTy))
2052  return AdjustCost(Entry->Cost);
2053 
2054  if (ST->hasAVX512())
2055  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
2056  SimpleDstTy, SimpleSrcTy))
2057  return AdjustCost(Entry->Cost);
2058  }
2059 
2060  if (ST->hasBWI())
2061  if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2062  SimpleDstTy, SimpleSrcTy))
2063  return AdjustCost(Entry->Cost);
2064 
2065  if (ST->hasDQI())
2066  if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2067  SimpleDstTy, SimpleSrcTy))
2068  return AdjustCost(Entry->Cost);
2069 
2070  if (ST->hasAVX512())
2071  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2072  SimpleDstTy, SimpleSrcTy))
2073  return AdjustCost(Entry->Cost);
2074 
2075  if (ST->hasAVX2()) {
2076  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2077  SimpleDstTy, SimpleSrcTy))
2078  return AdjustCost(Entry->Cost);
2079  }
2080 
2081  if (ST->hasAVX()) {
2082  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2083  SimpleDstTy, SimpleSrcTy))
2084  return AdjustCost(Entry->Cost);
2085  }
2086 
2087  if (ST->hasSSE41()) {
2088  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2089  SimpleDstTy, SimpleSrcTy))
2090  return AdjustCost(Entry->Cost);
2091  }
2092 
2093  if (ST->hasSSE2()) {
2094  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2095  SimpleDstTy, SimpleSrcTy))
2096  return AdjustCost(Entry->Cost);
2097  }
2098 
2099  return AdjustCost(
2100  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2101 }
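 // Worked example: on an SSE2-only target, truncating v2i64 to v2i32 hits
 // the { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 } entry above, i.e. a
 // single PSHUFD, and the adjusted cost is 1 * 1 = 1.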
2102 
2103 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2104  Type *CondTy,
2105  CmpInst::Predicate VecPred,
2106  TTI::TargetCostKind CostKind,
2107  const Instruction *I) {
2108  // TODO: Handle other cost kinds.
2109  if (CostKind != TTI::TCK_RecipThroughput)
2110  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2111  I);
2112 
2113  // Legalize the type.
2114  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2115 
2116  MVT MTy = LT.second;
2117 
2118  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2119  assert(ISD && "Invalid opcode");
2120 
2121  unsigned ExtraCost = 0;
2122  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
2123  // Some vector comparison predicates cost extra instructions.
2124  if (MTy.isVector() &&
2125  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2126  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2127  ST->hasBWI())) {
2128  switch (cast<CmpInst>(I)->getPredicate()) {
2129  case CmpInst::Predicate::ICMP_NE:
2130  // xor(cmpeq(x,y),-1)
2131  ExtraCost = 1;
2132  break;
2133  case CmpInst::Predicate::ICMP_SGE:
2134  case CmpInst::Predicate::ICMP_SLE:
2135  // xor(cmpgt(x,y),-1)
2136  ExtraCost = 1;
2137  break;
2138  case CmpInst::Predicate::ICMP_ULT:
2139  case CmpInst::Predicate::ICMP_UGT:
2140  // cmpgt(xor(x,signbit),xor(y,signbit))
2141  // xor(cmpeq(pmaxu(x,y),x),-1)
2142  ExtraCost = 2;
2143  break;
2144  case CmpInst::Predicate::ICMP_ULE:
2145  case CmpInst::Predicate::ICMP_UGE:
2146  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2147  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2148  // cmpeq(psubus(x,y),0)
2149  // cmpeq(pminu(x,y),x)
2150  ExtraCost = 1;
2151  } else {
2152  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2153  ExtraCost = 3;
2154  }
2155  break;
2156  default:
2157  break;
2158  }
2159  }
2160  }
2161 
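 // A sketch of the ICMP_UGT emulation costed above, written with SSE2
 // intrinsics (illustrative only; x and y stand for the compared vectors):
 // flip the sign bits, then reuse the signed compare.
 //   __m128i sb = _mm_set1_epi8(0x80);
 //   __m128i r = _mm_cmpgt_epi8(_mm_xor_si128(x, sb),
 //                              _mm_xor_si128(y, sb)); // 2 extra ops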
2162  static const CostTblEntry SLMCostTbl[] = {
2163  // slm pcmpeq/pcmpgt throughput is 2
2164  { ISD::SETCC, MVT::v2i64, 2 },
2165  };
2166 
2167  static const CostTblEntry AVX512BWCostTbl[] = {
2168  { ISD::SETCC, MVT::v32i16, 1 },
2169  { ISD::SETCC, MVT::v64i8, 1 },
2170 
2171  { ISD::SELECT, MVT::v32i16, 1 },
2172  { ISD::SELECT, MVT::v64i8, 1 },
2173  };
2174 
2175  static const CostTblEntry AVX512CostTbl[] = {
2176  { ISD::SETCC, MVT::v8i64, 1 },
2177  { ISD::SETCC, MVT::v16i32, 1 },
2178  { ISD::SETCC, MVT::v8f64, 1 },
2179  { ISD::SETCC, MVT::v16f32, 1 },
2180 
2181  { ISD::SELECT, MVT::v8i64, 1 },
2182  { ISD::SELECT, MVT::v16i32, 1 },
2183  { ISD::SELECT, MVT::v8f64, 1 },
2184  { ISD::SELECT, MVT::v16f32, 1 },
2185 
2186  { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2187  { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2188 
2189  { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2190  { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2191  };
2192 
2193  static const CostTblEntry AVX2CostTbl[] = {
2194  { ISD::SETCC, MVT::v4i64, 1 },
2195  { ISD::SETCC, MVT::v8i32, 1 },
2196  { ISD::SETCC, MVT::v16i16, 1 },
2197  { ISD::SETCC, MVT::v32i8, 1 },
2198 
2199  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2200  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2201  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2202  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2203  };
2204 
2205  static const CostTblEntry AVX1CostTbl[] = {
2206  { ISD::SETCC, MVT::v4f64, 1 },
2207  { ISD::SETCC, MVT::v8f32, 1 },
2208  // AVX1 does not support 8-wide integer compare.
2209  { ISD::SETCC, MVT::v4i64, 4 },
2210  { ISD::SETCC, MVT::v8i32, 4 },
2211  { ISD::SETCC, MVT::v16i16, 4 },
2212  { ISD::SETCC, MVT::v32i8, 4 },
2213 
2214  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2215  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2216  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2217  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2218  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2219  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2220  };
2221 
2222  static const CostTblEntry SSE42CostTbl[] = {
2223  { ISD::SETCC, MVT::v2f64, 1 },
2224  { ISD::SETCC, MVT::v4f32, 1 },
2225  { ISD::SETCC, MVT::v2i64, 1 },
2226  };
2227 
2228  static const CostTblEntry SSE41CostTbl[] = {
2229  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2230  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2231  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2232  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2233  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2234  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2235  };
2236 
2237  static const CostTblEntry SSE2CostTbl[] = {
2238  { ISD::SETCC, MVT::v2f64, 2 },
2239  { ISD::SETCC, MVT::f64, 1 },
2240  { ISD::SETCC, MVT::v2i64, 8 },
2241  { ISD::SETCC, MVT::v4i32, 1 },
2242  { ISD::SETCC, MVT::v8i16, 1 },
2243  { ISD::SETCC, MVT::v16i8, 1 },
2244 
2245  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2246  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2247  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2248  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2249  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2250  };
2251 
2252  static const CostTblEntry SSE1CostTbl[] = {
2253  { ISD::SETCC, MVT::v4f32, 2 },
2254  { ISD::SETCC, MVT::f32, 1 },
2255 
2256  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2257  };
2258 
2259  if (ST->isSLM())
2260  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2261  return LT.first * (ExtraCost + Entry->Cost);
2262 
2263  if (ST->hasBWI())
2264  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2265  return LT.first * (ExtraCost + Entry->Cost);
2266 
2267  if (ST->hasAVX512())
2268  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2269  return LT.first * (ExtraCost + Entry->Cost);
2270 
2271  if (ST->hasAVX2())
2272  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2273  return LT.first * (ExtraCost + Entry->Cost);
2274 
2275  if (ST->hasAVX())
2276  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2277  return LT.first * (ExtraCost + Entry->Cost);
2278 
2279  if (ST->hasSSE42())
2280  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2281  return LT.first * (ExtraCost + Entry->Cost);
2282 
2283  if (ST->hasSSE41())
2284  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2285  return LT.first * (ExtraCost + Entry->Cost);
2286 
2287  if (ST->hasSSE2())
2288  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2289  return LT.first * (ExtraCost + Entry->Cost);
2290 
2291  if (ST->hasSSE1())
2292  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2293  return LT.first * (ExtraCost + Entry->Cost);
2294 
2295  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2296 }
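 // Worked example: 'icmp uge <16 x i8>' on a plain SSE2 target pays
 // ExtraCost = 1 (the cmpeq(pminu(x,y),x) pattern) on top of the v16i8
 // SETCC entry of 1, so the reported cost is 1 * (1 + 1) = 2.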
2297 
2299 InstructionCost
2300 X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2301  TTI::TargetCostKind CostKind) {
2303 
2304  // Costs should match the codegen from:
2305  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2306  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2307  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2308  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2309  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2310 
2311  // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2312  // specialized in these tables yet.
2313  static const CostTblEntry AVX512CDCostTbl[] = {
2314  { ISD::CTLZ, MVT::v8i64, 1 },
2315  { ISD::CTLZ, MVT::v16i32, 1 },
2316  { ISD::CTLZ, MVT::v32i16, 8 },
2317  { ISD::CTLZ, MVT::v64i8, 20 },
2318  { ISD::CTLZ, MVT::v4i64, 1 },
2319  { ISD::CTLZ, MVT::v8i32, 1 },
2320  { ISD::CTLZ, MVT::v16i16, 4 },
2321  { ISD::CTLZ, MVT::v32i8, 10 },
2322  { ISD::CTLZ, MVT::v2i64, 1 },
2323  { ISD::CTLZ, MVT::v4i32, 1 },
2324  { ISD::CTLZ, MVT::v8i16, 4 },
2325  { ISD::CTLZ, MVT::v16i8, 4 },
2326  };
2327  static const CostTblEntry AVX512BWCostTbl[] = {
2328  { ISD::ABS, MVT::v32i16, 1 },
2329  { ISD::ABS, MVT::v64i8, 1 },
2330  { ISD::BITREVERSE, MVT::v8i64, 5 },
2331  { ISD::BITREVERSE, MVT::v16i32, 5 },
2332  { ISD::BITREVERSE, MVT::v32i16, 5 },
2333  { ISD::BITREVERSE, MVT::v64i8, 5 },
2334  { ISD::CTLZ, MVT::v8i64, 23 },
2335  { ISD::CTLZ, MVT::v16i32, 22 },
2336  { ISD::CTLZ, MVT::v32i16, 18 },
2337  { ISD::CTLZ, MVT::v64i8, 17 },
2338  { ISD::CTPOP, MVT::v8i64, 7 },
2339  { ISD::CTPOP, MVT::v16i32, 11 },
2340  { ISD::CTPOP, MVT::v32i16, 9 },
2341  { ISD::CTPOP, MVT::v64i8, 6 },
2342  { ISD::CTTZ, MVT::v8i64, 10 },
2343  { ISD::CTTZ, MVT::v16i32, 14 },
2344  { ISD::CTTZ, MVT::v32i16, 12 },
2345  { ISD::CTTZ, MVT::v64i8, 9 },
2346  { ISD::SADDSAT, MVT::v32i16, 1 },
2347  { ISD::SADDSAT, MVT::v64i8, 1 },
2348  { ISD::SMAX, MVT::v32i16, 1 },
2349  { ISD::SMAX, MVT::v64i8, 1 },
2350  { ISD::SMIN, MVT::v32i16, 1 },
2351  { ISD::SMIN, MVT::v64i8, 1 },
2352  { ISD::SSUBSAT, MVT::v32i16, 1 },
2353  { ISD::SSUBSAT, MVT::v64i8, 1 },
2354  { ISD::UADDSAT, MVT::v32i16, 1 },
2355  { ISD::UADDSAT, MVT::v64i8, 1 },
2356  { ISD::UMAX, MVT::v32i16, 1 },
2357  { ISD::UMAX, MVT::v64i8, 1 },
2358  { ISD::UMIN, MVT::v32i16, 1 },
2359  { ISD::UMIN, MVT::v64i8, 1 },
2360  { ISD::USUBSAT, MVT::v32i16, 1 },
2361  { ISD::USUBSAT, MVT::v64i8, 1 },
2362  };
2363  static const CostTblEntry AVX512CostTbl[] = {
2364  { ISD::ABS, MVT::v8i64, 1 },
2365  { ISD::ABS, MVT::v16i32, 1 },
2366  { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
2367  { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
2368  { ISD::ABS, MVT::v4i64, 1 },
2369  { ISD::ABS, MVT::v2i64, 1 },
2370  { ISD::BITREVERSE, MVT::v8i64, 36 },
2371  { ISD::BITREVERSE, MVT::v16i32, 24 },
2372  { ISD::BITREVERSE, MVT::v32i16, 10 },
2373  { ISD::BITREVERSE, MVT::v64i8, 10 },
2374  { ISD::CTLZ, MVT::v8i64, 29 },
2375  { ISD::CTLZ, MVT::v16i32, 35 },
2376  { ISD::CTLZ, MVT::v32i16, 28 },
2377  { ISD::CTLZ, MVT::v64i8, 18 },
2378  { ISD::CTPOP, MVT::v8i64, 16 },
2379  { ISD::CTPOP, MVT::v16i32, 24 },
2380  { ISD::CTPOP, MVT::v32i16, 18 },
2381  { ISD::CTPOP, MVT::v64i8, 12 },
2382  { ISD::CTTZ, MVT::v8i64, 20 },
2383  { ISD::CTTZ, MVT::v16i32, 28 },
2384  { ISD::CTTZ, MVT::v32i16, 24 },
2385  { ISD::CTTZ, MVT::v64i8, 18 },
2386  { ISD::SMAX, MVT::v8i64, 1 },
2387  { ISD::SMAX, MVT::v16i32, 1 },
2388  { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
2389  { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
2390  { ISD::SMAX, MVT::v4i64, 1 },
2391  { ISD::SMAX, MVT::v2i64, 1 },
2392  { ISD::SMIN, MVT::v8i64, 1 },
2393  { ISD::SMIN, MVT::v16i32, 1 },
2394  { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
2395  { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
2396  { ISD::SMIN, MVT::v4i64, 1 },
2397  { ISD::SMIN, MVT::v2i64, 1 },
2398  { ISD::UMAX, MVT::v8i64, 1 },
2399  { ISD::UMAX, MVT::v16i32, 1 },
2400  { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
2401  { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
2402  { ISD::UMAX, MVT::v4i64, 1 },
2403  { ISD::UMAX, MVT::v2i64, 1 },
2404  { ISD::UMIN, MVT::v8i64, 1 },
2405  { ISD::UMIN, MVT::v16i32, 1 },
2406  { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
2407  { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
2408  { ISD::UMIN, MVT::v4i64, 1 },
2409  { ISD::UMIN, MVT::v2i64, 1 },
2410  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2411  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2412  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2413  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2414  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2415  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2416  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2417  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2418  { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2419  { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2420  { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2421  { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2422  { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2423  { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2424  { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2425  { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2426  { ISD::FMAXNUM, MVT::f32, 2 },
2427  { ISD::FMAXNUM, MVT::v4f32, 2 },
2428  { ISD::FMAXNUM, MVT::v8f32, 2 },
2429  { ISD::FMAXNUM, MVT::v16f32, 2 },
2430  { ISD::FMAXNUM, MVT::f64, 2 },
2431  { ISD::FMAXNUM, MVT::v2f64, 2 },
2432  { ISD::FMAXNUM, MVT::v4f64, 2 },
2433  { ISD::FMAXNUM, MVT::v8f64, 2 },
2434  };
2435  static const CostTblEntry XOPCostTbl[] = {
2436  { ISD::BITREVERSE, MVT::v4i64, 4 },
2437  { ISD::BITREVERSE, MVT::v8i32, 4 },
2438  { ISD::BITREVERSE, MVT::v16i16, 4 },
2439  { ISD::BITREVERSE, MVT::v32i8, 4 },
2440  { ISD::BITREVERSE, MVT::v2i64, 1 },
2441  { ISD::BITREVERSE, MVT::v4i32, 1 },
2442  { ISD::BITREVERSE, MVT::v8i16, 1 },
2443  { ISD::BITREVERSE, MVT::v16i8, 1 },
2444  { ISD::BITREVERSE, MVT::i64, 3 },
2445  { ISD::BITREVERSE, MVT::i32, 3 },
2446  { ISD::BITREVERSE, MVT::i16, 3 },
2447  { ISD::BITREVERSE, MVT::i8, 3 }
2448  };
2449  static const CostTblEntry AVX2CostTbl[] = {
2450  { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2451  { ISD::ABS, MVT::v8i32, 1 },
2452  { ISD::ABS, MVT::v16i16, 1 },
2453  { ISD::ABS, MVT::v32i8, 1 },
2454  { ISD::BITREVERSE, MVT::v4i64, 5 },
2455  { ISD::BITREVERSE, MVT::v8i32, 5 },
2456  { ISD::BITREVERSE, MVT::v16i16, 5 },
2457  { ISD::BITREVERSE, MVT::v32i8, 5 },
2458  { ISD::BSWAP, MVT::v4i64, 1 },
2459  { ISD::BSWAP, MVT::v8i32, 1 },
2460  { ISD::BSWAP, MVT::v16i16, 1 },
2461  { ISD::CTLZ, MVT::v4i64, 23 },
2462  { ISD::CTLZ, MVT::v8i32, 18 },
2463  { ISD::CTLZ, MVT::v16i16, 14 },
2464  { ISD::CTLZ, MVT::v32i8, 9 },
2465  { ISD::CTPOP, MVT::v4i64, 7 },
2466  { ISD::CTPOP, MVT::v8i32, 11 },
2467  { ISD::CTPOP, MVT::v16i16, 9 },
2468  { ISD::CTPOP, MVT::v32i8, 6 },
2469  { ISD::CTTZ, MVT::v4i64, 10 },
2470  { ISD::CTTZ, MVT::v8i32, 14 },
2471  { ISD::CTTZ, MVT::v16i16, 12 },
2472  { ISD::CTTZ, MVT::v32i8, 9 },
2473  { ISD::SADDSAT, MVT::v16i16, 1 },
2474  { ISD::SADDSAT, MVT::v32i8, 1 },
2475  { ISD::SMAX, MVT::v8i32, 1 },
2476  { ISD::SMAX, MVT::v16i16, 1 },
2477  { ISD::SMAX, MVT::v32i8, 1 },
2478  { ISD::SMIN, MVT::v8i32, 1 },
2479  { ISD::SMIN, MVT::v16i16, 1 },
2480  { ISD::SMIN, MVT::v32i8, 1 },
2481  { ISD::SSUBSAT, MVT::v16i16, 1 },
2482  { ISD::SSUBSAT, MVT::v32i8, 1 },
2483  { ISD::UADDSAT, MVT::v16i16, 1 },
2484  { ISD::UADDSAT, MVT::v32i8, 1 },
2485  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2486  { ISD::UMAX, MVT::v8i32, 1 },
2487  { ISD::UMAX, MVT::v16i16, 1 },
2488  { ISD::UMAX, MVT::v32i8, 1 },
2489  { ISD::UMIN, MVT::v8i32, 1 },
2490  { ISD::UMIN, MVT::v16i16, 1 },
2491  { ISD::UMIN, MVT::v32i8, 1 },
2492  { ISD::USUBSAT, MVT::v16i16, 1 },
2493  { ISD::USUBSAT, MVT::v32i8, 1 },
2494  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2495  { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2496  { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2497  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2498  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2499  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2500  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2501  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2502  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2503  };
2504  static const CostTblEntry AVX1CostTbl[] = {
2505  { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2506  { ISD::ABS, MVT::v8i32, 3 },
2507  { ISD::ABS, MVT::v16i16, 3 },
2508  { ISD::ABS, MVT::v32i8, 3 },
2509  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2510  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2511  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2512  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2513  { ISD::BSWAP, MVT::v4i64, 4 },
2514  { ISD::BSWAP, MVT::v8i32, 4 },
2515  { ISD::BSWAP, MVT::v16i16, 4 },
2516  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2517  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2518  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2519  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2520  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2521  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2522  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2523  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2524  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2525  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2526  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2527  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2528  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2529  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2530  { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2531  { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2532  { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2533  { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2534  { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2535  { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2536  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2537  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2538  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2539  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2540  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2541  { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2542  { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2543  { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2544  { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2545  { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2546  { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2547  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2548  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2549  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2550  { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
2551  { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2552  { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
2553  { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
2554  { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2555  { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
2556  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2557  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2558  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2559  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2560  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2561  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2562  };
2563  static const CostTblEntry GLMCostTbl[] = {
2564  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2565  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2566  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2567  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2568  };
2569  static const CostTblEntry SLMCostTbl[] = {
2570  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2571  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2572  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2573  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2574  };
2575  static const CostTblEntry SSE42CostTbl[] = {
2576  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2577  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2578  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2579  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2580  };
2581  static const CostTblEntry SSE41CostTbl[] = {
2582  { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
2583  { ISD::SMAX, MVT::v4i32, 1 },
2584  { ISD::SMAX, MVT::v16i8, 1 },
2585  { ISD::SMIN, MVT::v4i32, 1 },
2586  { ISD::SMIN, MVT::v16i8, 1 },
2587  { ISD::UMAX, MVT::v4i32, 1 },
2588  { ISD::UMAX, MVT::v8i16, 1 },
2589  { ISD::UMIN, MVT::v4i32, 1 },
2590  { ISD::UMIN, MVT::v8i16, 1 },
2591  };
2592  static const CostTblEntry SSSE3CostTbl[] = {
2593  { ISD::ABS, MVT::v4i32, 1 },
2594  { ISD::ABS, MVT::v8i16, 1 },
2595  { ISD::ABS, MVT::v16i8, 1 },
2596  { ISD::BITREVERSE, MVT::v2i64, 5 },
2597  { ISD::BITREVERSE, MVT::v4i32, 5 },
2598  { ISD::BITREVERSE, MVT::v8i16, 5 },
2599  { ISD::BITREVERSE, MVT::v16i8, 5 },
2600  { ISD::BSWAP, MVT::v2i64, 1 },
2601  { ISD::BSWAP, MVT::v4i32, 1 },
2602  { ISD::BSWAP, MVT::v8i16, 1 },
2603  { ISD::CTLZ, MVT::v2i64, 23 },
2604  { ISD::CTLZ, MVT::v4i32, 18 },
2605  { ISD::CTLZ, MVT::v8i16, 14 },
2606  { ISD::CTLZ, MVT::v16i8, 9 },
2607  { ISD::CTPOP, MVT::v2i64, 7 },
2608  { ISD::CTPOP, MVT::v4i32, 11 },
2609  { ISD::CTPOP, MVT::v8i16, 9 },
2610  { ISD::CTPOP, MVT::v16i8, 6 },
2611  { ISD::CTTZ, MVT::v2i64, 10 },
2612  { ISD::CTTZ, MVT::v4i32, 14 },
2613  { ISD::CTTZ, MVT::v8i16, 12 },
2614  { ISD::CTTZ, MVT::v16i8, 9 }
2615  };
2616  static const CostTblEntry SSE2CostTbl[] = {
2617  { ISD::ABS, MVT::v2i64, 4 },
2618  { ISD::ABS, MVT::v4i32, 3 },
2619  { ISD::ABS, MVT::v8i16, 2 },
2620  { ISD::ABS, MVT::v16i8, 2 },
2621  { ISD::BITREVERSE, MVT::v2i64, 29 },
2622  { ISD::BITREVERSE, MVT::v4i32, 27 },
2623  { ISD::BITREVERSE, MVT::v8i16, 27 },
2624  { ISD::BITREVERSE, MVT::v16i8, 20 },
2625  { ISD::BSWAP, MVT::v2i64, 7 },
2626  { ISD::BSWAP, MVT::v4i32, 7 },
2627  { ISD::BSWAP, MVT::v8i16, 7 },
2628  { ISD::CTLZ, MVT::v2i64, 25 },
2629  { ISD::CTLZ, MVT::v4i32, 26 },
2630  { ISD::CTLZ, MVT::v8i16, 20 },
2631  { ISD::CTLZ, MVT::v16i8, 17 },
2632  { ISD::CTPOP, MVT::v2i64, 12 },
2633  { ISD::CTPOP, MVT::v4i32, 15 },
2634  { ISD::CTPOP, MVT::v8i16, 13 },
2635  { ISD::CTPOP, MVT::v16i8, 10 },
2636  { ISD::CTTZ, MVT::v2i64, 14 },
2637  { ISD::CTTZ, MVT::v4i32, 18 },
2638  { ISD::CTTZ, MVT::v8i16, 16 },
2639  { ISD::CTTZ, MVT::v16i8, 13 },
2640  { ISD::SADDSAT, MVT::v8i16, 1 },
2641  { ISD::SADDSAT, MVT::v16i8, 1 },
2642  { ISD::SMAX, MVT::v8i16, 1 },
2643  { ISD::SMIN, MVT::v8i16, 1 },
2644  { ISD::SSUBSAT, MVT::v8i16, 1 },
2645  { ISD::SSUBSAT, MVT::v16i8, 1 },
2646  { ISD::UADDSAT, MVT::v8i16, 1 },
2647  { ISD::UADDSAT, MVT::v16i8, 1 },
2648  { ISD::UMAX, MVT::v8i16, 2 },
2649  { ISD::UMAX, MVT::v16i8, 1 },
2650  { ISD::UMIN, MVT::v8i16, 2 },
2651  { ISD::UMIN, MVT::v16i8, 1 },
2652  { ISD::USUBSAT, MVT::v8i16, 1 },
2653  { ISD::USUBSAT, MVT::v16i8, 1 },
2654  { ISD::FMAXNUM, MVT::f64, 4 },
2655  { ISD::FMAXNUM, MVT::v2f64, 4 },
2656  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2657  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2658  };
2659  static const CostTblEntry SSE1CostTbl[] = {
2660  { ISD::FMAXNUM, MVT::f32, 4 },
2661  { ISD::FMAXNUM, MVT::v4f32, 4 },
2662  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2663  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2664  };
2665  static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
2666  { ISD::CTTZ, MVT::i64, 1 },
2667  };
2668  static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
2669  { ISD::CTTZ, MVT::i32, 1 },
2670  { ISD::CTTZ, MVT::i16, 1 },
2671  { ISD::CTTZ, MVT::i8, 1 },
2672  };
2673  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
2674  { ISD::CTLZ, MVT::i64, 1 },
2675  };
2676  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
2677  { ISD::CTLZ, MVT::i32, 1 },
2678  { ISD::CTLZ, MVT::i16, 1 },
2679  { ISD::CTLZ, MVT::i8, 1 },
2680  };
2681  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
2682  { ISD::CTPOP, MVT::i64, 1 },
2683  };
2684  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
2685  { ISD::CTPOP, MVT::i32, 1 },
2686  { ISD::CTPOP, MVT::i16, 1 },
2687  { ISD::CTPOP, MVT::i8, 1 },
2688  };
2689  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2690  { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
2691  { ISD::BITREVERSE, MVT::i64, 14 },
2692  { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
2693  { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
2694  { ISD::CTPOP, MVT::i64, 10 },
2695  { ISD::SADDO, MVT::i64, 1 },
2696  { ISD::UADDO, MVT::i64, 1 },
2697  { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
2698  };
2699  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2700  { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
2701  { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
2702  { ISD::BITREVERSE, MVT::i32, 14 },
2703  { ISD::BITREVERSE, MVT::i16, 14 },
2704  { ISD::BITREVERSE, MVT::i8, 11 },
2705  { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
2706  { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
2707  { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
2708  { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
2709  { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
2710  { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
2711  { ISD::CTPOP, MVT::i32, 8 },
2712  { ISD::CTPOP, MVT::i16, 9 },
2713  { ISD::CTPOP, MVT::i8, 7 },
2714  { ISD::SADDO, MVT::i32, 1 },
2715  { ISD::SADDO, MVT::i16, 1 },
2716  { ISD::SADDO, MVT::i8, 1 },
2717  { ISD::UADDO, MVT::i32, 1 },
2718  { ISD::UADDO, MVT::i16, 1 },
2719  { ISD::UADDO, MVT::i8, 1 },
2720  { ISD::UMULO, MVT::i32, 2 }, // mul + seto
2721  { ISD::UMULO, MVT::i16, 2 },
2722  { ISD::UMULO, MVT::i8, 2 },
2723  };
2724 
2725  Type *RetTy = ICA.getReturnType();
2726  Type *OpTy = RetTy;
2727  Intrinsic::ID IID = ICA.getID();
2728  unsigned ISD = ISD::DELETED_NODE;
2729  switch (IID) {
2730  default:
2731  break;
2732  case Intrinsic::abs:
2733  ISD = ISD::ABS;
2734  break;
2735  case Intrinsic::bitreverse:
2736  ISD = ISD::BITREVERSE;
2737  break;
2738  case Intrinsic::bswap:
2739  ISD = ISD::BSWAP;
2740  break;
2741  case Intrinsic::ctlz:
2742  ISD = ISD::CTLZ;
2743  break;
2744  case Intrinsic::ctpop:
2745  ISD = ISD::CTPOP;
2746  break;
2747  case Intrinsic::cttz:
2748  ISD = ISD::CTTZ;
2749  break;
2750  case Intrinsic::maxnum:
2751  case Intrinsic::minnum:
2752  // FMINNUM has same costs so don't duplicate.
2753  ISD = ISD::FMAXNUM;
2754  break;
2755  case Intrinsic::sadd_sat:
2756  ISD = ISD::SADDSAT;
2757  break;
2758  case Intrinsic::smax:
2759  ISD = ISD::SMAX;
2760  break;
2761  case Intrinsic::smin:
2762  ISD = ISD::SMIN;
2763  break;
2764  case Intrinsic::ssub_sat:
2765  ISD = ISD::SSUBSAT;
2766  break;
2767  case Intrinsic::uadd_sat:
2768  ISD = ISD::UADDSAT;
2769  break;
2770  case Intrinsic::umax:
2771  ISD = ISD::UMAX;
2772  break;
2773  case Intrinsic::umin:
2774  ISD = ISD::UMIN;
2775  break;
2776  case Intrinsic::usub_sat:
2777  ISD = ISD::USUBSAT;
2778  break;
2779  case Intrinsic::sqrt:
2780  ISD = ISD::FSQRT;
2781  break;
2782  case Intrinsic::sadd_with_overflow:
2783  case Intrinsic::ssub_with_overflow:
2784  // SSUBO has same costs so don't duplicate.
2785  ISD = ISD::SADDO;
2786  OpTy = RetTy->getContainedType(0);
2787  break;
2788  case Intrinsic::uadd_with_overflow:
2789  case Intrinsic::usub_with_overflow:
2790  // USUBO has same costs so don't duplicate.
2791  ISD = ISD::UADDO;
2792  OpTy = RetTy->getContainedType(0);
2793  break;
2794  case Intrinsic::umul_with_overflow:
2795  case Intrinsic::smul_with_overflow:
2796  // SMULO has same costs so don't duplicate.
2797  ISD = ISD::UMULO;
2798  OpTy = RetTy->getContainedType(0);
2799  break;
2800  }
2801 
2802  if (ISD != ISD::DELETED_NODE) {
2803  // Legalize the type.
2804  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2805  MVT MTy = LT.second;
2806 
2807  // Attempt to lookup cost.
2808  if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
2809  MTy.isVector()) {
2810  // With PSHUFB the code is very similar for all types. If we have integer
2811  // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
2812  // we also need a PSHUFB.
2813  unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
2814 
2815  // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
2816  // instructions. We also need an extract and an insert.
2817  if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
2818  (ST->hasBWI() && MTy.is512BitVector())))
2819  Cost = Cost * 2 + 2;
2820 
2821  return LT.first * Cost;
2822  }
2823 
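 // For illustration: with GFNI, a v16i8 bitreverse is a single
 // GF2P8AFFINEQB (Cost = 1) and a v4i32 bitreverse also needs a PSHUFB
 // (Cost = 2); a 256-bit type without AVX2 doubles that and adds an
 // extract/insert pair, e.g. v32i8 becomes 1 * 2 + 2 = 4.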
2824  auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost,
2825  FastMathFlags FMF) {
2826  // If there are no NaNs to deal with, then these are reduced to a single
2827  // MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
2828  // assume is used in the non-fast case.
2829  if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
2830  if (FMF.noNaNs())
2831  return LegalizationCost * 1;
2832  }
2833  return LegalizationCost * (int)Entry.Cost;
2834  };
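 // For illustration: a call such as
 //   %m = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
 // can lower to a single MAXPS, so the AVX1 table entry of 3
 // (MAXPS + CMPUNORDPS + BLENDVPS) collapses to LegalizationCost * 1 here.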
2835 
2836  if (ST->useGLMDivSqrtCosts())
2837  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2838  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2839 
2840  if (ST->isSLM())
2841  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2842  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2843 
2844  if (ST->hasCDI())
2845  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2846  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2847 
2848  if (ST->hasBWI())
2849  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2850  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2851 
2852  if (ST->hasAVX512())
2853  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2854  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2855 
2856  if (ST->hasXOP())
2857  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2858  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2859 
2860  if (ST->hasAVX2())
2861  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2862  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2863 
2864  if (ST->hasAVX())
2865  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2866  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2867 
2868  if (ST->hasSSE42())
2869  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2870  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2871 
2872  if (ST->hasSSE41())
2873  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2874  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2875 
2876  if (ST->hasSSSE3())
2877  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2878  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2879 
2880  if (ST->hasSSE2())
2881  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2882  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2883 
2884  if (ST->hasSSE1())
2885  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2886  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2887 
2888  if (ST->hasBMI()) {
2889  if (ST->is64Bit())
2890  if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
2891  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2892 
2893  if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
2894  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2895  }
2896 
2897  if (ST->hasLZCNT()) {
2898  if (ST->is64Bit())
2899  if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
2900  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2901 
2902  if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
2903  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2904  }
2905 
2906  if (ST->hasPOPCNT()) {
2907  if (ST->is64Bit())
2908  if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
2909  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2910 
2911  if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
2912  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2913  }
2914 
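 // For illustration: these feature tables shadow the generic ones below;
 // with POPCNT an i32 ctpop costs 1 (a single POPCNT instruction), while
 // the fallback X86CostTbl bit-twiddling sequence is priced at 8.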
2915  // TODO - add BMI (TZCNT) scalar handling
2916 
2917  if (ST->is64Bit())
2918  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2919  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2920 
2921  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2922  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2923  }
2924 
2925  return BaseT::getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2926 }
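 // Worked example: llvm.ctpop.v4i32 on an SSSE3 target maps to ISD::CTPOP;
 // v4i32 is already legal (LT.first == 1), so the SSSE3CostTbl entry of 11
 // is returned unchanged.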
2927 
2928 InstructionCost
2929 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2930  TTI::TargetCostKind CostKind) {
2931  if (ICA.isTypeBasedOnly())
2932  return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2933 
2934  static const CostTblEntry AVX512CostTbl[] = {
2935  { ISD::ROTL, MVT::v8i64, 1 },
2936  { ISD::ROTL, MVT::v4i64, 1 },
2937  { ISD::ROTL, MVT::v2i64, 1 },
2938  { ISD::ROTL, MVT::v16i32, 1 },
2939  { ISD::ROTL, MVT::v8i32, 1 },
2940  { ISD::ROTL, MVT::v4i32, 1 },
2941  { ISD::ROTR, MVT::v8i64, 1 },
2942  { ISD::ROTR, MVT::v4i64, 1 },
2943  { ISD::ROTR, MVT::v2i64, 1 },
2944  { ISD::ROTR, MVT::v16i32, 1 },
2945  { ISD::ROTR, MVT::v8i32, 1 },
2946  { ISD::ROTR, MVT::v4i32, 1 }
2947  };
2948  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2949  static const CostTblEntry XOPCostTbl[] = {
2950  { ISD::ROTL, MVT::v4i64, 4 },
2951  { ISD::ROTL, MVT::v8i32, 4 },
2952  { ISD::ROTL, MVT::v16i16, 4 },
2953  { ISD::ROTL, MVT::v32i8, 4 },
2954  { ISD::ROTL, MVT::v2i64, 1 },
2955  { ISD::ROTL, MVT::v4i32, 1 },
2956  { ISD::ROTL, MVT::v8i16, 1 },
2957  { ISD::ROTL, MVT::v16i8, 1 },
2958  { ISD::ROTR, MVT::v4i64, 6 },
2959  { ISD::ROTR, MVT::v8i32, 6 },
2960  { ISD::ROTR, MVT::v16i16, 6 },
2961  { ISD::ROTR, MVT::v32i8, 6 },
2962  { ISD::ROTR, MVT::v2i64, 2 },
2963  { ISD::ROTR, MVT::v4i32, 2 },
2964  { ISD::ROTR, MVT::v8i16, 2 },
2965  { ISD::ROTR, MVT::v16i8, 2 }
2966  };
2967  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2968  { ISD::ROTL, MVT::i64, 1 },
2969  { ISD::ROTR, MVT::i64, 1 },
2970  { ISD::FSHL, MVT::i64, 4 }
2971  };
2972  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2973  { ISD::ROTL, MVT::i32, 1 },
2974  { ISD::ROTL, MVT::i16, 1 },
2975  { ISD::ROTL, MVT::i8, 1 },
2976  { ISD::ROTR, MVT::i32, 1 },
2977  { ISD::ROTR, MVT::i16, 1 },
2978  { ISD::ROTR, MVT::i8, 1 },
2979  { ISD::FSHL, MVT::i32, 4 },
2980  { ISD::FSHL, MVT::i16, 4 },
2981  { ISD::FSHL, MVT::i8, 4 }
2982  };
2983 
2984  Intrinsic::ID IID = ICA.getID();
2985  Type *RetTy = ICA.getReturnType();
2986  const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
2987  unsigned ISD = ISD::DELETED_NODE;
2988  switch (IID) {
2989  default:
2990  break;
2991  case Intrinsic::fshl:
2992  ISD = ISD::FSHL;
2993  if (Args[0] == Args[1])
2994  ISD = ISD::ROTL;
2995  break;
2996  case Intrinsic::fshr:
2997  // FSHR has same costs so don't duplicate.
2998  ISD = ISD::FSHL;
2999  if (Args[0] == Args[1])
3000  ISD = ISD::ROTR;
3001  break;
3002  }
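 // For illustration: a funnel shift whose two value operands are the same
 // SSA value, e.g.
 //   %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %s)
 // is a rotate, and ROTL/i32 in X86CostTbl below is priced at 1 (a single
 // ROL).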
3003 
3004  if (ISD != ISD::DELETED_NODE) {
3005  // Legalize the type.
3006  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
3007  MVT MTy = LT.second;
3008 
3009  // Attempt to lookup cost.
3010  if (ST->hasAVX512())
3011  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3012  return LT.first * Entry->Cost;
3013 
3014  if (ST->hasXOP())
3015  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3016  return LT.first * Entry->Cost;
3017 
3018  if (ST->is64Bit())
3019  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3020  return LT.first * Entry->Cost;
3021 
3022  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3023  return LT.first * Entry->Cost;
3024  }
3025 
3026  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3027 }
3028 
3029 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3030  unsigned Index) {
3031  static const CostTblEntry SLMCostTbl[] = {
3036  };
3037 
3038  assert(Val->isVectorTy() && "This must be a vector type");
3039  Type *ScalarType = Val->getScalarType();
3040  int RegisterFileMoveCost = 0;
3041 
3042  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
3043  Opcode == Instruction::InsertElement)) {
3044  // Legalize the type.
3045  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
3046 
3047  // This type is legalized to a scalar type.
3048  if (!LT.second.isVector())
3049  return 0;
3050 
3051  // The type may be split. Normalize the index to the new type.
3052  unsigned NumElts = LT.second.getVectorNumElements();
3053  unsigned SubNumElts = NumElts;
3054  Index = Index % NumElts;
3055 
3056  // For >128-bit vectors, we need to extract higher 128-bit subvectors.
3057  // For inserts, we also need to insert the subvector back.
3058  if (LT.second.getSizeInBits() > 128) {
3059  assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
3060  unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
3061  SubNumElts = NumElts / NumSubVecs;
3062  if (SubNumElts <= Index) {
3063  RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
3064  Index %= SubNumElts;
3065  }
3066  }
3067 
3068  if (Index == 0) {
3069  // Floating point scalars are already located in index #0.
3070  // Many insertions to index #0 can fold away for scalar fp-ops, so assume
3071  // that holds for all of them.
3072  if (ScalarType->isFloatingPointTy())
3073  return RegisterFileMoveCost;
3074 
3075  // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
3076  if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
3077  return 1 + RegisterFileMoveCost;
3078  }
3079 
3080  int ISD = TLI->InstructionOpcodeToISD(Opcode);
3081  assert(ISD && "Unexpected vector opcode");
3082  MVT MScalarTy = LT.second.getScalarType();
3083  if (ST->isSLM())
3084  if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
3085  return Entry->Cost + RegisterFileMoveCost;
3086 
3087  // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
3088  if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3089  (MScalarTy.isInteger() && ST->hasSSE41()))
3090  return 1 + RegisterFileMoveCost;
3091 
3092  // Assume insertps is relatively cheap on all targets.
3093  if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
3094  Opcode == Instruction::InsertElement)
3095  return 1 + RegisterFileMoveCost;
3096 
3097  // For extractions we just need to shuffle the element to index 0, which
3098  // should be very cheap (assume cost = 1). For insertions we need to shuffle
3099  // the elements to their destination. In both cases we must handle the
3100  // subvector move(s).
3101  // If the vector type is already less than 128-bits then don't reduce it.
3102  // TODO: Under what circumstances should we shuffle using the full width?
3103  InstructionCost ShuffleCost = 1;
3104  if (Opcode == Instruction::InsertElement) {
3105  auto *SubTy = cast<VectorType>(Val);
3106  EVT VT = TLI->getValueType(DL, Val);
3107  if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
3108  SubTy = FixedVectorType::get(ScalarType, SubNumElts);
3109  ShuffleCost =
3110  getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
3111  }
3112  int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
3113  return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3114  }
3115 
3116  // Add to the base cost if we know that the extracted element of a vector is
3117  // destined to be moved to and used in the integer register file.
3118  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3119  RegisterFileMoveCost += 1;
3120 
3121  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3122 }
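 // Worked example: extracting element 0 of a <4 x i32> is assumed to be a
 // single MOVD (cost 1, RegisterFileMoveCost 0), while extracting element
 // 5 of a <8 x i32> under AVX first pays one subvector extract
 // (RegisterFileMoveCost 1) and then indexes element 1 of the upper half.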
3123 
3124 InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3125  const APInt &DemandedElts,
3126  bool Insert, bool Extract) {
3127  unsigned Cost = 0;
3128 
3129  // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
3130  // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3131  if (Insert) {
3132  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3133  MVT MScalarTy = LT.second.getScalarType();
3134 
3135  if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3136  (MScalarTy.isInteger() && ST->hasSSE41()) ||
3137  (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3138  // For types we can insert directly, insertion into 128-bit sub vectors is
3139  // cheap, followed by a cheap chain of concatenations.
3140  if (LT.second.getSizeInBits() <= 128) {
3141  Cost +=
3142  BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3143  } else {
3144  // In each 128-bit lane, if at least one index is demanded but not all
3145  // indices are demanded, and this lane is not the first 128-bit lane of
3146  // the legalized vector, then this lane needs an extracti128; if a
3147  // 128-bit lane has at least one demanded index, it also needs an
3148  // inserti128.
3149 
3150  // The following cases will help build a better understanding:
3151  // Assume we insert several elements into a v8i32 vector with AVX2:
3152  // Case#1: inserting into index 1 needs vpinsrd + inserti128.
3153  // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
3154  // inserti128.
3155  // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
3156  unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first;
3157  unsigned NumElts = LT.second.getVectorNumElements() * LT.first;
3158  APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
3159  unsigned Scale = NumElts / Num128Lanes;
3160  // We iterate each 128-lane, and check if we need a
3161  // extracti128/inserti128 for this 128-lane.
3162  for (unsigned I = 0; I < NumElts; I += Scale) {
3163  APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
3164  APInt MaskedDE = Mask & WidenedDemandedElts;
3165  unsigned Population = MaskedDE.countPopulation();
3166  Cost += (Population > 0 && Population != Scale &&
3167  I % LT.second.getVectorNumElements() != 0);
3168  Cost += Population > 0;
3169  }
3170  Cost += DemandedElts.countPopulation();
3171 
3172  // For vXf32 cases, insertion into the 0'th index in each v4f32
3173  // 128-bit vector is free.
3174  // NOTE: This assumes legalization widens vXf32 vectors.
3175  if (MScalarTy == MVT::f32)
3176  for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
3177  i < e; i += 4)
3178  if (DemandedElts[i])
3179  Cost--;
3180  }
3181  } else if (LT.second.isVector()) {
3182  // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
3183  // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
3184  // series of UNPCK followed by CONCAT_VECTORS - all of these can be
3185  // considered cheap.
3186  if (Ty->isIntOrIntVectorTy())
3187  Cost += DemandedElts.countPopulation();
3188 
3189  // Get the smaller of the legalized or original pow2-extended number of
3190  // vector elements; one less than this is the number of unpacks we'll
3191  // end up performing.
3192  unsigned NumElts = LT.second.getVectorNumElements();
3193  unsigned Pow2Elts =
3194  PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
3195  Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
3196  }
3197  }
3198 
3199  // TODO: Use default extraction for now, but we should investigate extending this
3200  // to handle repeated subvector extraction.
3201  if (Extract)
3202  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
3203 
3204  return Cost;
3205 }
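// A hedged usage sketch of the insertion costing above, for Case #2
// (element 5 of a v8i32 on an AVX2 target); TTI and Ctx stand for an
// existing TargetTransformInfo and LLVMContext:
//
//   auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 8); // v8i32
//   APInt Demanded(/*numBits=*/8, 0);
//   Demanded.setBit(5); // only element 5 is inserted
//   // Models extracti128 + vpinsrd + inserti128 on the upper 128-bit lane.
//   auto Overhead = TTI.getScalarizationOverhead(VecTy, Demanded,
//                                                /*Insert=*/true,
//                                                /*Extract=*/false);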
3206 
3207 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
3208  MaybeAlign Alignment,
3209  unsigned AddressSpace,
3210  TTI::TargetCostKind CostKind,
3211  const Instruction *I) {
3212  // TODO: Handle other cost kinds.
3213  if (CostKind != TTI::TCK_RecipThroughput) {
3214  if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
3215  // A store instruction with index and scale addressing costs 2 uops.
3216  // Check the preceding GEP to identify non-constant indices.
3217  if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
3218  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3219  return TTI::TCC_Basic * 2;
3220  }
3221  }
3222  return TTI::TCC_Basic;
3223  }
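// For example, a store whose address comes from a GEP with a non-constant
// index, as in the IR sketch below, uses an index-and-scale addressing mode
// and is costed as TTI::TCC_Basic * 2 above:
//
//   %p = getelementptr inbounds i32, i32* %base, i64 %i
//   store i32 %v, i32* %p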
3224 
3225  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3226  "Invalid Opcode");
3227  // Type legalization can't handle structs
3228  if (TLI->getValueType(DL, Src, true) == MVT::Other)
3229  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3230  CostKind);
3231 
3232  // Handle non-power-of-two vectors such as <3 x float> and <48 x i16>
3233  if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
3234  const unsigned NumElem = VTy->getNumElements();
3235  if (!isPowerOf2_32(NumElem)) {
3236  // Factorize NumElem into a sum of powers of two.
3237  InstructionCost Cost = 0;
3238  unsigned NumElemDone = 0;
3239  for (unsigned NumElemLeft = NumElem, Factor;
3240  Factor = PowerOf2Floor(NumElemLeft), NumElemLeft > 0;
3241  NumElemLeft -= Factor) {
3242  Type *SubTy = FixedVectorType::get(VTy->getScalarType(), Factor);
3243  unsigned SubTyBytes = SubTy->getPrimitiveSizeInBits() / 8;
3244 
3245  Cost +=
3246  getMemoryOpCost(Opcode, SubTy, Alignment, AddressSpace, CostKind);
3247 
3248  std::pair<int, MVT> LST = TLI->getTypeLegalizationCost(DL, SubTy);
3249  if (!LST.second.isVector()) {
3250  APInt DemandedElts =
3251  APInt::getBitsSet(NumElem, NumElemDone, NumElemDone + Factor);
3252  Cost += getScalarizationOverhead(VTy, DemandedElts,
3253  Opcode == Instruction::Load,
3254  Opcode == Instruction::Store);
3255  }
3256 
3257  NumElemDone += Factor;
3258  Alignment = commonAlignment(Alignment.valueOrOne(), SubTyBytes);
3259  }
3260  assert(NumElemDone == NumElem && "Processed wrong element count?");
3261  return Cost;
3262  }
3263  }
3264 
3265  // Legalize the type.
3266  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
3267 
3268  // Each load/store unit costs 1.
3269  int Cost = LT.first * 1;
3270 
3271  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
3272  // proxy for a double-pumped AVX memory interface such as on Sandy Bridge.
3273  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
3274  Cost *= 2;
3275 
3276  return Cost;
3277 }
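// As a worked example of the non-power-of-two factorization above, a
// <3 x float> load is split as 3 = 2 + 1 and costed as a <2 x float> load
// plus a <1 x float> load; a hedged sketch of the query (TTI and Ctx stand
// for an existing TargetTransformInfo and LLVMContext):
//
//   auto *V3F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 3);
//   auto LoadCost = TTI.getMemoryOpCost(Instruction::Load, V3F32, Align(4),
//                                       /*AddressSpace=*/0);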
3278 
3279 InstructionCost
3280 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
3281  unsigned AddressSpace,
3282  TTI::TargetCostKind CostKind) {
3283  bool IsLoad = (Instruction::Load == Opcode);
3284  bool IsStore = (Instruction::Store == Opcode);
3285 
3286  auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
3287  if (!SrcVTy)
3288  // For scalars, take the regular cost without the mask.
3289  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
3290 
3291  unsigned NumElem = SrcVTy->getNumElements();
3292  auto *MaskTy =
3293  FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
3294  if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
3295  (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
3296  !isPowerOf2_32(NumElem)) {
3297  // Scalarization
3298  APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3299  InstructionCost MaskSplitCost =
3300  getScalarizationOverhead(MaskTy, DemandedElts, false, true);
3301  InstructionCost ScalarCompareCost = getCmpSelInstrCost(
3302  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
3303  CmpInst::BAD_ICMP_PREDICATE, CostKind);
3304  InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
3305  InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
3306  InstructionCost ValueSplitCost =
3307  getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
3308  InstructionCost MemopCost =
3309  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3310  Alignment, AddressSpace, CostKind);
3311  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
3312  }
3313 
3314  // Legalize the type.
3315  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3316  auto VT = TLI->getValueType(DL, SrcVTy);
3317  InstructionCost Cost = 0;
3318  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
3319  LT.second.getVectorNumElements() == NumElem)
3320  // Promotion requires an expand/truncate for the data and a shuffle for the mask.
3321  Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
3322  getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
3323 
3324  else if (LT.second.getVectorNumElements() > NumElem) {
3325  auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
3326  LT.second.getVectorNumElements());
3327  // Expanding requires filling the mask with zeroes.
3328  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
3329  }
3330 
3331  // Pre-AVX512: each maskmov load costs 2, each maskmov store costs ~8.
3332  if (!ST->hasAVX512())
3333  return Cost + LT.first * (IsLoad ? 2 : 8);
3334 
3335  // AVX-512 masked load/store is cheaper.
3336  return Cost + LT.first;
3337 }
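// A hedged sketch of querying the masked-op cost above (TTI and Ctx stand
// for an existing TargetTransformInfo and LLVMContext); with AVX a v8f32
// masked load is legal and takes the maskmov path, while targets where it
// is illegal fall into the scalarization branch:
//
//   auto *V8F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 8);
//   auto MaskedCost = TTI.getMaskedMemoryOpCost(
//       Instruction::Load, V8F32, Align(4), /*AddressSpace=*/0);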
3338 
3339 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
3340  const SCEV *Ptr) {
3341  // Address computations in vectorized code with non-consecutive addresses will
3342  // likely result in more instructions compared to scalar code where the
3343  // computation can more often be merged into the index mode. The resulting
3344  // extra micro-ops can significantly decrease throughput.
3345  const unsigned NumVectorInstToHideOverhead = 10;
3346 
3347  // The cost of computing a strided access is hidden by X86's indexing
3348  // modes, regardless of the stride value. We don't believe there is a
3349  // difference between constant strided access in general and a constant
3350  // stride whose value is less than or equal to 64.
3351  // Even for a (loop-invariant) stride whose value is not known at
3352  // compile time, the address computation will not incur more than one
3353  // extra ADD instruction.
3354  if (Ty->isVectorTy() && SE) {
3355  if (!BaseT::isStridedAccess(Ptr))
3356  return NumVectorInstToHideOverhead;
3357  if (!BaseT::getConstantStrideStep(SE, Ptr))
3358  return 1;
3359  }
3360 
3361  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
3362 }
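// The non-strided case above corresponds to gather-like access such as the
// loop sketched below, where each lane's address must be materialized
// separately and the cost of 10 per vector address computation applies:
//
//   for (i = 0; i < n; ++i)
//     sum += a[b[i]]; // indices are data-dependent, not strided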
3363 
3364 InstructionCost
3365 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3366  bool IsPairwise,
3367  TTI::TargetCostKind CostKind) {
3368  // Just use the default implementation for pair reductions.
3369  if (IsPairwise)
3370  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
3371 
3372  // We use the Intel Architecture Code Analyzer (IACA) to measure the
3373  // throughput and use that as the cost.
3374 
3375  static const CostTblEntry SLMCostTblNoPairWise[] = {
3376  { ISD::FADD, MVT::v2f64, 3 },
3377  { ISD::ADD, MVT::v2i64, 5 },
3378  };
3379 
3380  static const CostTblEntry SSE2CostTblNoPairWise[] = {
3381  { ISD::FADD, MVT::v2f64, 2 },
3382  { ISD::FADD, MVT::v4f32, 4 },
3383  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
3384  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
3385  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
3386  { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
3387  { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
3388  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
3389  { ISD::ADD, MVT::v2i8, 2 },
3390  { ISD::ADD, MVT::v4i8, 2 },
3391  { ISD::ADD, MVT::v8i8, 2 },
3392  { ISD::ADD, MVT::v16i8, 3 },
3393  };
3394 
3395  static const CostTblEntry AVX1CostTblNoPairWise[] = {
3396  { ISD::FADD, MVT::v4f64, 3 },
3397  { ISD::FADD, MVT::v4f32, 3 },
3398  { ISD::FADD, MVT::v8f32, 4 },
3399  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
3400  { ISD::ADD, MVT::v4i64, 3 },
3401  { ISD::ADD, MVT::v8i32, 5 },
3402  { ISD::ADD, MVT::v16i16, 5 },
3403  { ISD::ADD, MVT::v32i8, 4 },
3404  };
3405 
3406  int ISD = TLI->InstructionOpcodeToISD(Opcode);
3407  assert(ISD && "Invalid opcode");
3408 
3409  // Before legalizing the type, give a chance to look up illegal narrow types
3410  // in the table.
3411  // FIXME: Is there a better way to do this?
3412  EVT VT = TLI->getValueType(DL, ValTy);
3413  if (VT.isSimple()) {
3414  MVT MTy = VT.getSimpleVT();
3415  if (ST->isSLM())
3416  if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3417  return Entry->Cost;
3418 
3419  if (ST->hasAVX())
3420  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3421  return Entry->Cost;
3422 
3423  if (ST->hasSSE2())
3424  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3425  return Entry->Cost;
3426  }
3427 
3428  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3429 
3430  MVT MTy = LT.second;
3431 
3432  auto *ValVTy = cast<FixedVectorType>(ValTy);
3433 
3434  // Special case: vXi8 mul reductions are performed as vXi16.
3435  if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
3436  auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
3437  auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
3438  return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
3439  TTI::CastContextHint::None,
3440  CostKind) +
3441  getArithmeticReductionCost(Opcode, WideVecTy, IsPairwise, CostKind);
3442  }
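// E.g. a v16i8 mul reduction is costed as a zext to v16i16 plus a v16i16
// mul reduction; a hedged sketch of the equivalent query (TTI and Ctx
// stand for an existing TargetTransformInfo and LLVMContext):
//
//   auto *V16I8 = FixedVectorType::get(Type::getInt8Ty(Ctx), 16);
//   auto RedCost = TTI.getArithmeticReductionCost(Instruction::Mul, V16I8,
//                                                 /*IsPairwiseForm=*/false);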
3443 
3444  InstructionCost ArithmeticCost = 0;
3445  if (LT.first != 1 && MTy.isVector() &&
3446  MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3447  // Type needs to be split. We need LT.first - 1 arithmetic ops.
3448  auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3449  MTy.getVectorNumElements());
3450  ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3451  ArithmeticCost *= LT.first - 1;
3452  }
3453 
3454  if (ST->isSLM())
3455  if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3456  return ArithmeticCost + Entry->Cost;
3457 
3458  if (ST->hasAVX())
3459  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3460  return ArithmeticCost + Entry->Cost;
3461 
3462  if (ST->hasSSE2())
3463  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3464  return ArithmeticCost + Entry->Cost;
3465 
3466  // FIXME: These assume a naive kshift+binop lowering, which is probably
3467  // conservative in most cases.
3468  static const CostTblEntry AVX512BoolReduction[] = {
3469  { ISD::AND, MVT::v2i1, 3 },
3470  { ISD::AND, MVT::v4i1, 5 },
3471  { ISD::AND, MVT::v8i1, 7 },
3472  { ISD::AND, MVT::v16i1, 9 },
3473  { ISD::AND, MVT::v32i1, 11 },
3474  { ISD::AND, MVT::v64i1, 13 },
3475  { ISD::OR, MVT::v2i1, 3 },
3476  { ISD::OR, MVT::v4i1, 5 },
3477  { ISD::OR, MVT::v8i1, 7 },
3478  { ISD::OR, MVT::v16i1, 9 },
3479  { ISD::OR, MVT::v32i1, 11 },
3480  { ISD::OR, MVT::v64i1, 13 },
3481  };
3482 
3483  static const CostTblEntry AVX2BoolReduction[] = {
3484  { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
3485  { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
3486  { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
3487  { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
3488  };
3489 
3490  static const CostTblEntry AVX1BoolReduction[] = {
3491  { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
3492  { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
3493  { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3494  { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3495  { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp