X86TargetTransformInfo.cpp
1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// A note about the cost-model numbers used below: they correspond to some
16 /// "generic" X86 CPU rather than a concrete CPU model. Usually the numbers
17 /// correspond to the CPU where the feature first appeared. For example, if we
18 /// do Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem,
19 /// as that was the first CPU to support that feature level and thus most
20 /// likely has the worst-case cost.
21 /// Some examples of other technologies/CPUs:
22 ///   SSE 3   - Pentium4 / Athlon64
23 ///   SSE 4.1 - Penryn
24 ///   SSE 4.2 - Nehalem
25 ///   AVX     - Sandy Bridge
26 ///   AVX2    - Haswell
27 ///   AVX-512 - Xeon Phi / Skylake
28 /// And some examples of instruction target dependent costs (latency)
29 ///                   divss     sqrtss     rsqrtss
30 ///   AMD K7          11-16     19         3
31 ///   Piledriver      9-24      13-15      5
32 ///   Jaguar          14        16         2
33 ///   Pentium II,III  18        30         2
34 ///   Nehalem         7-14      7-18       3
35 ///   Haswell         10-13     11         5
36 /// TODO: Develop and implement the target dependent cost model and
37 /// specialize cost numbers for different Cost Model Targets such as throughput,
38 /// code size, latency and uop count.
39 //===----------------------------------------------------------------------===//
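// --- Illustrative sketch (not from this file) -------------------------------
// How a client pass typically consumes the cost numbers implemented below,
// going through the generic TargetTransformInfo wrapper rather than calling
// X86TTIImpl directly. The helper name is made up; it assumes the usual
// headers (llvm/Analysis/TargetTransformInfo.h, llvm/IR/DerivedTypes.h) and a
// TTI object built for an X86 target, e.g. via TargetIRAnalysis.
static int exampleVectorFDivCost(const llvm::TargetTransformInfo &TTI,
                                 llvm::LLVMContext &Ctx) {
  // Cost of a <4 x float> fdiv; with SSE4.2 the SSE42CostTable below reports
  // 14 (Nehalem numbers from agner.org).
  llvm::Type *V4F32 = llvm::VectorType::get(llvm::Type::getFloatTy(Ctx), 4);
  return TTI.getArithmeticInstrCost(llvm::Instruction::FDiv, V4F32);
}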
40 
41 #include "X86TargetTransformInfo.h"
42 #include "llvm/Analysis/TargetTransformInfo.h"
43 #include "llvm/CodeGen/BasicTTIImpl.h"
44 #include "llvm/CodeGen/CostTable.h"
45 #include "llvm/CodeGen/TargetLowering.h"
46 #include "llvm/IR/IntrinsicInst.h"
47 #include "llvm/Support/Debug.h"
48 
49 using namespace llvm;
50 
51 #define DEBUG_TYPE "x86tti"
52 
53 //===----------------------------------------------------------------------===//
54 //
55 // X86 cost model.
56 //
57 //===----------------------------------------------------------------------===//
58 
59 TargetTransformInfo::PopcntSupportKind
60 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62   // TODO: Currently the __builtin_popcount() implementation using SSE3
63   //   instructions is inefficient. Once the problem is fixed, we should
64   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
65   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
66 }
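// Illustrative sketch (not from this file): the kind of query a transform
// makes against the hook above, e.g. when deciding whether a ctpop-based
// sequence is profitable. With +popcnt this returns PSK_FastHardware.
static bool exampleHasFastPopcnt(const llvm::TargetTransformInfo &TTI) {
  return TTI.getPopcntSupport(/*TyWidth=*/32) ==
         llvm::TargetTransformInfo::PSK_FastHardware;
}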
67 
68 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
69     TargetTransformInfo::CacheLevel Level) const {
70   switch (Level) {
71   case TargetTransformInfo::CacheLevel::L1D:
72  // - Penryn
73  // - Nehalem
74  // - Westmere
75  // - Sandy Bridge
76  // - Ivy Bridge
77  // - Haswell
78  // - Broadwell
79  // - Skylake
80  // - Kabylake
81     return 32 * 1024; //  32 KByte
82   case TargetTransformInfo::CacheLevel::L2D:
83     //   - Penryn
84  // - Nehalem
85  // - Westmere
86  // - Sandy Bridge
87  // - Ivy Bridge
88  // - Haswell
89  // - Broadwell
90  // - Skylake
91  // - Kabylake
92  return 256 * 1024; // 256 KByte
93  }
94 
95  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
96 }
97 
98 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
99     TargetTransformInfo::CacheLevel Level) const {
100   //   - Penryn
101  // - Nehalem
102  // - Westmere
103  // - Sandy Bridge
104  // - Ivy Bridge
105  // - Haswell
106  // - Broadwell
107  // - Skylake
108  // - Kabylake
109   switch (Level) {
110   case TargetTransformInfo::CacheLevel::L1D:
111     LLVM_FALLTHROUGH;
112   case TargetTransformInfo::CacheLevel::L2D:
113     return 8;
114  }
115 
116  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
117 }
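// Illustrative sketch (not from this file): reading the cache parameters
// exposed by the two hooks above through the TTI wrapper. Both queries return
// llvm::Optional, so callers must handle targets that report nothing.
static unsigned exampleL1SizeInBytes(const llvm::TargetTransformInfo &TTI) {
  if (llvm::Optional<unsigned> Size =
          TTI.getCacheSize(llvm::TargetTransformInfo::CacheLevel::L1D))
    return *Size; // 32 KiB for the X86 implementation above.
  return 0;       // Size unknown for this target.
}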
118 
119 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
120  if (Vector && !ST->hasSSE1())
121  return 0;
122 
123  if (ST->is64Bit()) {
124  if (Vector && ST->hasAVX512())
125  return 32;
126  return 16;
127  }
128  return 8;
129 }
130 
131 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
132  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
133  if (Vector) {
134  if (ST->hasAVX512() && PreferVectorWidth >= 512)
135  return 512;
136  if (ST->hasAVX() && PreferVectorWidth >= 256)
137  return 256;
138  if (ST->hasSSE1() && PreferVectorWidth >= 128)
139  return 128;
140  return 0;
141  }
142 
143  if (ST->is64Bit())
144  return 64;
145 
146  return 32;
147 }
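// Illustrative sketch (not from this file): how a vectorizer-style client
// turns the register width reported above into a maximum vectorization factor
// for a given scalar element size. The helper name is made up.
static unsigned exampleMaxVFForElement(const llvm::TargetTransformInfo &TTI,
                                       unsigned ElementSizeInBits) {
  unsigned VectorRegBits = TTI.getRegisterBitWidth(/*Vector=*/true);
  // E.g. 256-bit AVX registers and 32-bit elements give a VF of 8.
  return VectorRegBits ? VectorRegBits / ElementSizeInBits : 1;
}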
148 
149 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
150  return getRegisterBitWidth(true);
151 }
152 
153 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
154  // If the loop will not be vectorized, don't interleave the loop.
155   // Let the regular unroller unroll the loop instead, which saves the
156   // overflow check and memory-check cost.
157  if (VF == 1)
158  return 1;
159 
160  if (ST->isAtom())
161  return 1;
162 
163  // Sandybridge and Haswell have multiple execution ports and pipelined
164  // vector units.
165  if (ST->hasAVX())
166  return 4;
167 
168  return 2;
169 }
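// Illustrative sketch (not from this file): together with the register width,
// the interleave factor above bounds how many scalar elements the loop
// vectorizer processes per iteration (e.g. VF=8 with IC=4 covers 32 elements
// on an AVX2 target).
static unsigned exampleElementsPerIteration(const llvm::TargetTransformInfo &TTI,
                                            unsigned VF) {
  return VF * TTI.getMaxInterleaveFactor(VF);
}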
170 
171 int X86TTIImpl::getArithmeticInstrCost(
172     unsigned Opcode, Type *Ty,
173     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
174     TTI::OperandValueProperties Opd1PropInfo,
175     TTI::OperandValueProperties Opd2PropInfo,
176     ArrayRef<const Value *> Args) {
177   // Legalize the type.
178  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
179 
180  int ISD = TLI->InstructionOpcodeToISD(Opcode);
181  assert(ISD && "Invalid opcode");
182 
183  static const CostTblEntry GLMCostTable[] = {
184  { ISD::FDIV, MVT::f32, 18 }, // divss
185  { ISD::FDIV, MVT::v4f32, 35 }, // divps
186  { ISD::FDIV, MVT::f64, 33 }, // divsd
187  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
188  };
189 
190  if (ST->isGLM())
191  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
192  LT.second))
193  return LT.first * Entry->Cost;
194 
195  static const CostTblEntry SLMCostTable[] = {
196  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
197  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
198  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
199  { ISD::FMUL, MVT::f64, 2 }, // mulsd
200  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
201  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
202  { ISD::FDIV, MVT::f32, 17 }, // divss
203  { ISD::FDIV, MVT::v4f32, 39 }, // divps
204  { ISD::FDIV, MVT::f64, 32 }, // divsd
205  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
206  { ISD::FADD, MVT::v2f64, 2 }, // addpd
207  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
208  // v2i64/v4i64 mul is custom lowered as a series of long:
209  // multiplies(3), shifts(3) and adds(2)
210  // slm muldq version throughput is 2 and addq throughput 4
211  // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
212  // 3X4 (addq throughput) = 17
213  { ISD::MUL, MVT::v2i64, 17 },
214  // slm addq\subq throughput is 4
215  { ISD::ADD, MVT::v2i64, 4 },
216  { ISD::SUB, MVT::v2i64, 4 },
217  };
218 
219  if (ST->isSLM()) {
220  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
221       // Check if the operands can be shrunk into a smaller datatype.
222  bool Op1Signed = false;
223  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
224  bool Op2Signed = false;
225  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
226 
227  bool signedMode = Op1Signed | Op2Signed;
228  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
229 
230  if (OpMinSize <= 7)
231  return LT.first * 3; // pmullw/sext
232  if (!signedMode && OpMinSize <= 8)
233  return LT.first * 3; // pmullw/zext
234  if (OpMinSize <= 15)
235  return LT.first * 5; // pmullw/pmulhw/pshuf
236  if (!signedMode && OpMinSize <= 16)
237  return LT.first * 5; // pmullw/pmulhw/pshuf
238  }
239 
240  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
241  LT.second)) {
242  return LT.first * Entry->Cost;
243  }
244  }
245 
246  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
247        ISD == ISD::UREM) &&
248       (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
249        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
250       Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
251  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
252  // On X86, vector signed division by constants power-of-two are
253  // normally expanded to the sequence SRA + SRL + ADD + SRA.
254  // The OperandValue properties may not be the same as that of the previous
255  // operation; conservatively assume OP_None.
256  int Cost =
257         2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
258                                    TargetTransformInfo::OP_None,
259                                    TargetTransformInfo::OP_None);
260     Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
261                                    TargetTransformInfo::OP_None,
262                                    TargetTransformInfo::OP_None);
263     Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
264                                    TargetTransformInfo::OP_None,
265                                    TargetTransformInfo::OP_None);
266 
267  if (ISD == ISD::SREM) {
268  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
269  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
270  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
271  }
272 
273  return Cost;
274  }
275 
276  // Vector unsigned division/remainder will be simplified to shifts/masks.
277  if (ISD == ISD::UDIV)
278       return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
279                                     TargetTransformInfo::OP_None,
280                                     TargetTransformInfo::OP_None);
281 
282  if (ISD == ISD::UREM)
283       return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
284                                     TargetTransformInfo::OP_None,
285                                     TargetTransformInfo::OP_None);
286   }
287 
288  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
289  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
290  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
291  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
292  };
293 
294   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
295       ST->hasBWI()) {
296  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
297  LT.second))
298  return LT.first * Entry->Cost;
299  }
300 
301  static const CostTblEntry AVX512UniformConstCostTable[] = {
302  { ISD::SRA, MVT::v2i64, 1 },
303  { ISD::SRA, MVT::v4i64, 1 },
304  { ISD::SRA, MVT::v8i64, 1 },
305  };
306 
307   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
308       ST->hasAVX512()) {
309  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
310  LT.second))
311  return LT.first * Entry->Cost;
312  }
313 
314  static const CostTblEntry AVX2UniformConstCostTable[] = {
315  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
316  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
317  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
318 
319  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
320  };
321 
322   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
323       ST->hasAVX2()) {
324  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
325  LT.second))
326  return LT.first * Entry->Cost;
327  }
328 
329  static const CostTblEntry SSE2UniformConstCostTable[] = {
330  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
331  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
332  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
333 
334  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
335  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
336  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
337  };
338 
339  // XOP has faster vXi8 shifts.
340   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
341       ST->hasSSE2() && !ST->hasXOP()) {
342  if (const auto *Entry =
343  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
344  return LT.first * Entry->Cost;
345  }
346 
347  static const CostTblEntry AVX512BWConstCostTable[] = {
348  { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
349  { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
350  { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
351  { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
352  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
353  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
354  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
355  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
356  };
357 
358   if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
359        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
360       ST->hasBWI()) {
361  if (const auto *Entry =
362  CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
363  return LT.first * Entry->Cost;
364  }
365 
366  static const CostTblEntry AVX512ConstCostTable[] = {
367  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
368  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
369  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
370  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
371  };
372 
373   if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
374        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
375       ST->hasAVX512()) {
376  if (const auto *Entry =
377  CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
378  return LT.first * Entry->Cost;
379  }
380 
381  static const CostTblEntry AVX2ConstCostTable[] = {
382  { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
383  { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
384  { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
385  { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
386  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
387  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
388  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
389  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
390  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
391  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
392  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
393  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
394  };
395 
396   if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
397        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
398       ST->hasAVX2()) {
399  if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
400  return LT.first * Entry->Cost;
401  }
402 
403  static const CostTblEntry SSE2ConstCostTable[] = {
404  { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
405  { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
406  { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
407  { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
408  { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
409  { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
410  { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
411  { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
412  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
413  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
414  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
415  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
416  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
417  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
418  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
419  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
420  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
421  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
422  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
423  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
424  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
425  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
426  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
427  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
428  };
429 
430   if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
431        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
432       ST->hasSSE2()) {
433  // pmuldq sequence.
434  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
435  return LT.first * 32;
436  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
437  return LT.first * 38;
438  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
439  return LT.first * 15;
440  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
441  return LT.first * 20;
442 
443  if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
444  return LT.first * Entry->Cost;
445  }
446 
447  static const CostTblEntry AVX2UniformCostTable[] = {
448  // Uniform splats are cheaper for the following instructions.
449  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
450  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
451  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
452  };
453 
454  if (ST->hasAVX2() &&
455       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
456        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
457  if (const auto *Entry =
458  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
459  return LT.first * Entry->Cost;
460  }
461 
462  static const CostTblEntry SSE2UniformCostTable[] = {
463  // Uniform splats are cheaper for the following instructions.
464  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
465  { ISD::SHL, MVT::v4i32, 1 }, // pslld
466  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
467 
468  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
469  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
470  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
471 
472  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
473  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
474  };
475 
476  if (ST->hasSSE2() &&
477       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
478        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
479  if (const auto *Entry =
480  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
481  return LT.first * Entry->Cost;
482  }
483 
484  static const CostTblEntry AVX512DQCostTable[] = {
485  { ISD::MUL, MVT::v2i64, 1 },
486  { ISD::MUL, MVT::v4i64, 1 },
487  { ISD::MUL, MVT::v8i64, 1 }
488  };
489 
490  // Look for AVX512DQ lowering tricks for custom cases.
491  if (ST->hasDQI())
492  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
493  return LT.first * Entry->Cost;
494 
495  static const CostTblEntry AVX512BWCostTable[] = {
496  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
497  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
498  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
499 
500  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
501  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
502  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
503 
504  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
505  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
506  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
507 
508  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
509  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
510  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
511 
512  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
513  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
514  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
515  };
516 
517  // Look for AVX512BW lowering tricks for custom cases.
518  if (ST->hasBWI())
519  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
520  return LT.first * Entry->Cost;
521 
522  static const CostTblEntry AVX512CostTable[] = {
523  { ISD::SHL, MVT::v16i32, 1 },
524  { ISD::SRL, MVT::v16i32, 1 },
525  { ISD::SRA, MVT::v16i32, 1 },
526 
527  { ISD::SHL, MVT::v8i64, 1 },
528  { ISD::SRL, MVT::v8i64, 1 },
529 
530  { ISD::SRA, MVT::v2i64, 1 },
531  { ISD::SRA, MVT::v4i64, 1 },
532  { ISD::SRA, MVT::v8i64, 1 },
533 
534  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
535  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
536  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
537  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
538  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
539  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
540 
541  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
542  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
543  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
544 
545  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
546  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
547  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
548  };
549 
550  if (ST->hasAVX512())
551  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
552  return LT.first * Entry->Cost;
553 
554  static const CostTblEntry AVX2ShiftCostTable[] = {
555     // Shifts on v4i64/v8i32 are legal on AVX2, even though we declare them
556     // custom so we can detect the cases where the shift amount is a scalar.
557  { ISD::SHL, MVT::v4i32, 1 },
558  { ISD::SRL, MVT::v4i32, 1 },
559  { ISD::SRA, MVT::v4i32, 1 },
560  { ISD::SHL, MVT::v8i32, 1 },
561  { ISD::SRL, MVT::v8i32, 1 },
562  { ISD::SRA, MVT::v8i32, 1 },
563  { ISD::SHL, MVT::v2i64, 1 },
564  { ISD::SRL, MVT::v2i64, 1 },
565  { ISD::SHL, MVT::v4i64, 1 },
566  { ISD::SRL, MVT::v4i64, 1 },
567  };
568 
569  // Look for AVX2 lowering tricks.
570  if (ST->hasAVX2()) {
571     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
572         (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
573          Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
574       // On AVX2, a packed v16i16 shift left by a constant build_vector
575       // is lowered into a vector multiply (vpmullw).
576       return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
577                                     TargetTransformInfo::OP_None,
578                                     TargetTransformInfo::OP_None);
579 
580  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
581  return LT.first * Entry->Cost;
582  }
583 
584  static const CostTblEntry XOPShiftCostTable[] = {
585  // 128bit shifts take 1cy, but right shifts require negation beforehand.
586  { ISD::SHL, MVT::v16i8, 1 },
587  { ISD::SRL, MVT::v16i8, 2 },
588  { ISD::SRA, MVT::v16i8, 2 },
589  { ISD::SHL, MVT::v8i16, 1 },
590  { ISD::SRL, MVT::v8i16, 2 },
591  { ISD::SRA, MVT::v8i16, 2 },
592  { ISD::SHL, MVT::v4i32, 1 },
593  { ISD::SRL, MVT::v4i32, 2 },
594  { ISD::SRA, MVT::v4i32, 2 },
595  { ISD::SHL, MVT::v2i64, 1 },
596  { ISD::SRL, MVT::v2i64, 2 },
597  { ISD::SRA, MVT::v2i64, 2 },
598  // 256bit shifts require splitting if AVX2 didn't catch them above.
599  { ISD::SHL, MVT::v32i8, 2+2 },
600  { ISD::SRL, MVT::v32i8, 4+2 },
601  { ISD::SRA, MVT::v32i8, 4+2 },
602  { ISD::SHL, MVT::v16i16, 2+2 },
603  { ISD::SRL, MVT::v16i16, 4+2 },
604  { ISD::SRA, MVT::v16i16, 4+2 },
605  { ISD::SHL, MVT::v8i32, 2+2 },
606  { ISD::SRL, MVT::v8i32, 4+2 },
607  { ISD::SRA, MVT::v8i32, 4+2 },
608  { ISD::SHL, MVT::v4i64, 2+2 },
609  { ISD::SRL, MVT::v4i64, 4+2 },
610  { ISD::SRA, MVT::v4i64, 4+2 },
611  };
612 
613  // Look for XOP lowering tricks.
614  if (ST->hasXOP()) {
615  // If the right shift is constant then we'll fold the negation so
616  // it's as cheap as a left shift.
617  int ShiftISD = ISD;
618     if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
619         (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
620          Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
621       ShiftISD = ISD::SHL;
622  if (const auto *Entry =
623  CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
624  return LT.first * Entry->Cost;
625  }
626 
627  static const CostTblEntry SSE2UniformShiftCostTable[] = {
628  // Uniform splats are cheaper for the following instructions.
629  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
630  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
631  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
632 
633  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
634  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
635  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
636 
637  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
638  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
639  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
640  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
641  };
642 
643  if (ST->hasSSE2() &&
644       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
645        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
646 
647  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
648  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
649  return LT.first * 4; // 2*psrad + shuffle.
650 
651  if (const auto *Entry =
652  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
653  return LT.first * Entry->Cost;
654  }
655 
656   if (ISD == ISD::SHL &&
657       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
658     MVT VT = LT.second;
659  // Vector shift left by non uniform constant can be lowered
660  // into vector multiply.
661  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
662  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
663  ISD = ISD::MUL;
664  }
665 
666  static const CostTblEntry AVX2CostTable[] = {
667  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
668  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
669 
670  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
671  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
672 
673  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
674  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
675  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
676  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
677 
678  { ISD::SUB, MVT::v32i8, 1 }, // psubb
679  { ISD::ADD, MVT::v32i8, 1 }, // paddb
680  { ISD::SUB, MVT::v16i16, 1 }, // psubw
681  { ISD::ADD, MVT::v16i16, 1 }, // paddw
682  { ISD::SUB, MVT::v8i32, 1 }, // psubd
683  { ISD::ADD, MVT::v8i32, 1 }, // paddd
684  { ISD::SUB, MVT::v4i64, 1 }, // psubq
685  { ISD::ADD, MVT::v4i64, 1 }, // paddq
686 
687  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
688  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
689  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
690  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
691  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
692 
693  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
694  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
695  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
696  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
697  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
698  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
699 
700  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
701  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
702  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
703  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
704  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
705  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
706  };
707 
708  // Look for AVX2 lowering tricks for custom cases.
709  if (ST->hasAVX2())
710  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
711  return LT.first * Entry->Cost;
712 
713  static const CostTblEntry AVX1CostTable[] = {
714  // We don't have to scalarize unsupported ops. We can issue two half-sized
715  // operations and we only need to extract the upper YMM half.
716  // Two ops + 1 extract + 1 insert = 4.
717  { ISD::MUL, MVT::v16i16, 4 },
718  { ISD::MUL, MVT::v8i32, 4 },
719  { ISD::SUB, MVT::v32i8, 4 },
720  { ISD::ADD, MVT::v32i8, 4 },
721  { ISD::SUB, MVT::v16i16, 4 },
722  { ISD::ADD, MVT::v16i16, 4 },
723  { ISD::SUB, MVT::v8i32, 4 },
724  { ISD::ADD, MVT::v8i32, 4 },
725  { ISD::SUB, MVT::v4i64, 4 },
726  { ISD::ADD, MVT::v4i64, 4 },
727 
728  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
729  // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
730  // Because we believe v4i64 to be a legal type, we must also include the
731  // extract+insert in the cost table. Therefore, the cost here is 18
732  // instead of 8.
733  { ISD::MUL, MVT::v4i64, 18 },
734 
735  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
736 
737  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
738  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
739  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
740  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
741  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
742  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
743  };
744 
745  if (ST->hasAVX())
746  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
747  return LT.first * Entry->Cost;
748 
749  static const CostTblEntry SSE42CostTable[] = {
750  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
751  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
752  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
753  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
754 
755  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
756  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
757  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
758  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
759 
760  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
761  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
762  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
763  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
764 
765  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
766  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
767  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
768  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
769  };
770 
771  if (ST->hasSSE42())
772  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
773  return LT.first * Entry->Cost;
774 
775  static const CostTblEntry SSE41CostTable[] = {
776  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
777  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
778  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
779  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
780  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
781  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
782 
783  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
784  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
785  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
786  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
787  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
788  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
789 
790  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
791  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
792  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
793  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
794  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
795  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
796 
797  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
798  };
799 
800  if (ST->hasSSE41())
801  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
802  return LT.first * Entry->Cost;
803 
804  static const CostTblEntry SSE2CostTable[] = {
805  // We don't correctly identify costs of casts because they are marked as
806  // custom.
807  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
808  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
809  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
810  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
811  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
812 
813  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
814  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
815  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
816  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
817  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
818 
819  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
820  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
821  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
822  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
823  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
824 
825  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
826  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
827  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
828  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
829 
830  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
831  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
832  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
833  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
834 
835  { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
836  { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
837 
838  { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
839  { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
840  };
841 
842  if (ST->hasSSE2())
843  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
844  return LT.first * Entry->Cost;
845 
846  static const CostTblEntry SSE1CostTable[] = {
847  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
848  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
849 
850  { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
851  { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
852 
853  { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
854  { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
855 
856  { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
857  { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
858  { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
859 
860  { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
861  { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
862  { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
863  };
864 
865  if (ST->hasSSE1())
866  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
867  return LT.first * Entry->Cost;
868 
869  // It is not a good idea to vectorize division. We have to scalarize it and
870   // in the process we will often end up having to spill regular
871   // registers. The overhead of division is going to dominate most kernels
872   // anyway, so try hard to prevent vectorization of division - it is
873  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
874  // to hide "20 cycles" for each lane.
875  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
876  ISD == ISD::UDIV || ISD == ISD::UREM)) {
877  int ScalarCost = getArithmeticInstrCost(
878         Opcode, Ty->getScalarType(), Op1Info, Op2Info,
879         TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
880     return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
881  }
882 
883  // Fallback to the default implementation.
884  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
885 }
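// Illustrative sketch (not from this file): what the table-driven scheme above
// computes for a type wider than the legal register. The numbers quoted come
// from the SSE41CostTable entry { ISD::MUL, MVT::v4i32, 2 }; the helper name
// is made up and assumes a target with SSE4.1 but no AVX.
static int exampleWideMulCost(const llvm::TargetTransformInfo &TTI,
                              llvm::LLVMContext &Ctx) {
  // <8 x i32> is not legal with 128-bit registers; legalization splits it into
  // two <4 x i32> halves (LT.first == 2), each costing 2 for pmulld, so the
  // reported cost is LT.first * Entry->Cost == 4.
  llvm::Type *V8I32 = llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx), 8);
  return TTI.getArithmeticInstrCost(llvm::Instruction::Mul, V8I32);
}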
886 
887 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
888                                Type *SubTp) {
889  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
890  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
891  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
892 
893  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
894  if (Kind == TTI::SK_Transpose)
895  Kind = TTI::SK_PermuteTwoSrc;
896 
897  // For Broadcasts we are splatting the first element from the first input
898  // register, so only need to reference that input and all the output
899  // registers are the same.
900  if (Kind == TTI::SK_Broadcast)
901  LT.first = 1;
902 
903  // Subvector extractions are free if they start at the beginning of a
904  // vector and cheap if the subvectors are aligned.
905  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
906  int NumElts = LT.second.getVectorNumElements();
907  if ((Index % NumElts) == 0)
908  return 0;
909  std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
910  if (SubLT.second.isVector()) {
911  int NumSubElts = SubLT.second.getVectorNumElements();
912  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
913  return SubLT.first;
914  }
915  }
916 
917   // We are going to permute multiple sources and the result will be in
918   // multiple destinations. We provide an accurate cost only for splits where
919   // the element type remains the same.
920  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
921  MVT LegalVT = LT.second;
922  if (LegalVT.isVector() &&
923  LegalVT.getVectorElementType().getSizeInBits() ==
924             Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
925         LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
926 
927  unsigned VecTySize = DL.getTypeStoreSize(Tp);
928  unsigned LegalVTSize = LegalVT.getStoreSize();
929  // Number of source vectors after legalization:
930  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
931  // Number of destination vectors after legalization:
932  unsigned NumOfDests = LT.first;
933 
934  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
935  LegalVT.getVectorNumElements());
936 
937  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
938  return NumOfShuffles *
939  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
940  }
941 
942  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
943  }
944 
945  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
946  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
947  // We assume that source and destination have the same vector type.
948  int NumOfDests = LT.first;
949  int NumOfShufflesPerDest = LT.first * 2 - 1;
950  LT.first = NumOfDests * NumOfShufflesPerDest;
951  }
952 
953  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
954  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
955  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
956 
957  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
958  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
959 
960  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
961  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
962  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
963  };
964 
965  if (ST->hasVBMI())
966  if (const auto *Entry =
967  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
968  return LT.first * Entry->Cost;
969 
970  static const CostTblEntry AVX512BWShuffleTbl[] = {
971  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
972  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
973 
974  {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
975  {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
976  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
977 
978  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
979  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
980  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
981  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
982  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
983 
984  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
985  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
986  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
987  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
988  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
989  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
990  };
991 
992  if (ST->hasBWI())
993  if (const auto *Entry =
994  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
995  return LT.first * Entry->Cost;
996 
997  static const CostTblEntry AVX512ShuffleTbl[] = {
998  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
999  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1000  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1001  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1002 
1003  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1004  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1005  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1006  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1007 
1008  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1009  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1010  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1011  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1012  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1013  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1014  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1015  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1016  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1017  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1018  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1019  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1020  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1021 
1022  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1023  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1024  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1025  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1026  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1027  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1028  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1029  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1030  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1031  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1032  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1033  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
1034  };
1035 
1036  if (ST->hasAVX512())
1037  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1038  return LT.first * Entry->Cost;
1039 
1040  static const CostTblEntry AVX2ShuffleTbl[] = {
1041  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1042  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1043  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1044  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1045  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1046  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1047 
1048  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1049  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1050  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1051  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1052  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1053  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1054 
1055  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1056  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1057 
1058  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1059  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1060  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1061  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1062  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1063  // + vpblendvb
1064  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1065  // + vpblendvb
1066 
1067  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1068  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1069  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1070  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1071  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1072  // + vpblendvb
1073  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1074  // + vpblendvb
1075  };
1076 
1077  if (ST->hasAVX2())
1078  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1079  return LT.first * Entry->Cost;
1080 
1081  static const CostTblEntry XOPShuffleTbl[] = {
1082  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1083  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1084  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1085  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1086  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1087  // + vinsertf128
1088  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1089  // + vinsertf128
1090 
1091  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1092  // + vinsertf128
1093  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1094  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1095  // + vinsertf128
1096  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1097  };
1098 
1099  if (ST->hasXOP())
1100  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1101  return LT.first * Entry->Cost;
1102 
1103  static const CostTblEntry AVX1ShuffleTbl[] = {
1104  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1105  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1106  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1107  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1108  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1109  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1110 
1111  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1112  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1113  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1114  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1115  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1116  // + vinsertf128
1117  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1118  // + vinsertf128
1119 
1120  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1121  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1122  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1123  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1124  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1125  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1126 
1127  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1128  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1129  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1130  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1131  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1132  // + 2*por + vinsertf128
1133  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1134  // + 2*por + vinsertf128
1135 
1136  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1137  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1138  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1139  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1140  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1141  // + 4*por + vinsertf128
1142  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1143  // + 4*por + vinsertf128
1144  };
1145 
1146  if (ST->hasAVX())
1147  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1148  return LT.first * Entry->Cost;
1149 
1150  static const CostTblEntry SSE41ShuffleTbl[] = {
1151  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1152  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1153  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1154  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1155  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1156  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1157  };
1158 
1159  if (ST->hasSSE41())
1160  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1161  return LT.first * Entry->Cost;
1162 
1163  static const CostTblEntry SSSE3ShuffleTbl[] = {
1164  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1165  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1166 
1167  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1168  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1169 
1170  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1171  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1172 
1173  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1174  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1175 
1176  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1177  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1178  };
1179 
1180  if (ST->hasSSSE3())
1181  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1182  return LT.first * Entry->Cost;
1183 
1184  static const CostTblEntry SSE2ShuffleTbl[] = {
1185  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1186  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1187  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1188  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1189  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1190 
1191  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1192  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1193  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1194  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1195  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1196  // + 2*pshufd + 2*unpck + packus
1197 
1198  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1199  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1200  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1201  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1202  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1203 
1204  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1205  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1206  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1207  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1208  // + pshufd/unpck
1209  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1210  // + 2*pshufd + 2*unpck + 2*packus
1211 
1212  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1213  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1214  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1215  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1216  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1217  };
1218 
1219  if (ST->hasSSE2())
1220  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1221  return LT.first * Entry->Cost;
1222 
1223  static const CostTblEntry SSE1ShuffleTbl[] = {
1224  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1225  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1226  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1227  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1228  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1229  };
1230 
1231  if (ST->hasSSE1())
1232  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1233  return LT.first * Entry->Cost;
1234 
1235  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1236 }
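// Illustrative sketch (not from this file): querying the shuffle tables above.
// On an AVX2 target an <8 x i32> broadcast matches the AVX2ShuffleTbl entry
// for vpbroadcastd and costs 1; other subtarget levels fall through to the
// later tables or to BaseT::getShuffleCost. The helper name is made up.
static int exampleBroadcastCost(const llvm::TargetTransformInfo &TTI,
                                llvm::LLVMContext &Ctx) {
  llvm::Type *V8I32 = llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx), 8);
  return TTI.getShuffleCost(llvm::TargetTransformInfo::SK_Broadcast, V8I32);
}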
1237 
1238 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1239  const Instruction *I) {
1240  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1241  assert(ISD && "Invalid opcode");
1242 
1243  // FIXME: Need a better design of the cost table to handle non-simple types of
1244  // potential massive combinations (elem_num x src_type x dst_type).
1245 
1246  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1249 
1250  // Mask sign extend has an instruction.
1257 
1258  // Mask zero extend is a load + broadcast.
1265  };
1266 
1267  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1274 
1281 
1288 
1295  };
1296 
1297  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1298  // 256-bit wide vectors.
1299 
1300  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1304 
1309 
1310  // v16i1 -> v16i32 - load + broadcast
1321 
1330 
1355 
1357 
1367  };
1368 
1369  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1386 
1393 
1396 
1398  };
1399 
1400  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1417 
1425 
1438 
1454  // The generic code to compute the scalar overhead is currently broken.
1455  // Workaround this limitation by estimating the scalarization overhead
1456  // here. We have roughly 10 instructions per scalar element.
1457  // Multiply that by the vector width.
1458  // FIXME: remove that when PR19268 is fixed.
1461 
1464  // This node is expanded into scalarized operations but BasicTTI is overly
1465  // optimistic estimating its cost. It computes 3 per element (one
1466  // vector-extract, one scalar conversion and one vector-insert). The
1467  // problem is that the inserts form a read-modify-write chain so latency
1468  // should be factored in too. Inflating the cost per element by 1.
1471 
1474  };
1475 
1476  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1483 
1502 
1510 
1512  };
1513 
1514  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1515  // These are somewhat magic numbers justified by looking at the output of
1516  // Intel's IACA, running some kernels and making sure when we take
1517  // legalization into account the throughput will be overestimated.
1519  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1526 
1527  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1535 
1537 
1539 
1564 
1574  };
1575 
1576  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1577  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1578 
1579  if (ST->hasSSE2() && !ST->hasAVX()) {
1580  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1581  LTDest.second, LTSrc.second))
1582  return LTSrc.first * Entry->Cost;
1583  }
1584 
1585  EVT SrcTy = TLI->getValueType(DL, Src);
1586  EVT DstTy = TLI->getValueType(DL, Dst);
1587 
1588  // The function getSimpleVT only handles simple value types.
1589  if (!SrcTy.isSimple() || !DstTy.isSimple())
1590  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1591 
1592  MVT SimpleSrcTy = SrcTy.getSimpleVT();
1593  MVT SimpleDstTy = DstTy.getSimpleVT();
1594 
1595  // Make sure that neither type is going to be split before using the
1596  // AVX512 tables. This handles -mprefer-vector-width=256
1597  // with -min-legal-vector-width<=256
1598  if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
1599  TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
1600  if (ST->hasBWI())
1601  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
1602  SimpleDstTy, SimpleSrcTy))
1603  return Entry->Cost;
1604 
1605  if (ST->hasDQI())
1606  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1607  SimpleDstTy, SimpleSrcTy))
1608  return Entry->Cost;
1609 
1610  if (ST->hasAVX512())
1611  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1612  SimpleDstTy, SimpleSrcTy))
1613  return Entry->Cost;
1614  }
1615 
1616  if (ST->hasAVX2()) {
1617  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1618  SimpleDstTy, SimpleSrcTy))
1619  return Entry->Cost;
1620  }
1621 
1622  if (ST->hasAVX()) {
1623  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1624  SimpleDstTy, SimpleSrcTy))
1625  return Entry->Cost;
1626  }
1627 
1628  if (ST->hasSSE41()) {
1629  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1630  SimpleDstTy, SimpleSrcTy))
1631  return Entry->Cost;
1632  }
1633 
1634  if (ST->hasSSE2()) {
1635  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1636  SimpleDstTy, SimpleSrcTy))
1637  return Entry->Cost;
1638  }
1639 
1640  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1641 }
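// Illustrative sketch (not from this file): a cast-cost query against the
// conversion tables above. The sext example is arbitrary; both types are
// legalized first, the matching per-subtarget table is consulted, and the
// call falls back to BaseT::getCastInstrCost when no entry matches.
static int exampleSExtCost(const llvm::TargetTransformInfo &TTI,
                           llvm::LLVMContext &Ctx) {
  llvm::Type *V4I16 = llvm::VectorType::get(llvm::Type::getInt16Ty(Ctx), 4);
  llvm::Type *V4I32 = llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  return TTI.getCastInstrCost(llvm::Instruction::SExt, V4I32, V4I16);
}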
1642 
1643 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1644  const Instruction *I) {
1645  // Legalize the type.
1646  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1647 
1648  MVT MTy = LT.second;
1649 
1650  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1651  assert(ISD && "Invalid opcode");
1652 
1653  unsigned ExtraCost = 0;
1654  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
1655  // Some vector comparison predicates cost extra instructions.
1656  if (MTy.isVector() &&
1657  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
1658  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
1659  ST->hasBWI())) {
1660  switch (cast<CmpInst>(I)->getPredicate()) {
1661  case CmpInst::Predicate::ICMP_NE:
1662  // xor(cmpeq(x,y),-1)
1663  ExtraCost = 1;
1664  break;
1665  case CmpInst::Predicate::ICMP_SGE:
1666  case CmpInst::Predicate::ICMP_SLE:
1667  // xor(cmpgt(x,y),-1)
1668  ExtraCost = 1;
1669  break;
1670  case CmpInst::Predicate::ICMP_ULT:
1671  case CmpInst::Predicate::ICMP_UGT:
1672  // cmpgt(xor(x,signbit),xor(y,signbit))
1673  // xor(cmpeq(pmaxu(x,y),x),-1)
1674  ExtraCost = 2;
1675  break;
1676  case CmpInst::Predicate::ICMP_ULE:
1677  case CmpInst::Predicate::ICMP_UGE:
1678  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
1679  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
1680  // cmpeq(psubus(x,y),0)
1681  // cmpeq(pminu(x,y),x)
1682  ExtraCost = 1;
1683  } else {
1684  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
1685  ExtraCost = 3;
1686  }
1687  break;
1688  default:
1689  break;
1690  }
1691  }
1692  }
1693 
1694  static const CostTblEntry AVX512BWCostTbl[] = {
1695  { ISD::SETCC, MVT::v32i16, 1 },
1696  { ISD::SETCC, MVT::v64i8, 1 },
1697 
1698  { ISD::SELECT, MVT::v32i16, 1 },
1699  { ISD::SELECT, MVT::v64i8, 1 },
1700  };
1701 
1702  static const CostTblEntry AVX512CostTbl[] = {
1703  { ISD::SETCC, MVT::v8i64, 1 },
1704  { ISD::SETCC, MVT::v16i32, 1 },
1705  { ISD::SETCC, MVT::v8f64, 1 },
1706  { ISD::SETCC, MVT::v16f32, 1 },
1707 
1708  { ISD::SELECT, MVT::v8i64, 1 },
1709  { ISD::SELECT, MVT::v16i32, 1 },
1710  { ISD::SELECT, MVT::v8f64, 1 },
1711  { ISD::SELECT, MVT::v16f32, 1 },
1712  };
1713 
1714  static const CostTblEntry AVX2CostTbl[] = {
1715  { ISD::SETCC, MVT::v4i64, 1 },
1716  { ISD::SETCC, MVT::v8i32, 1 },
1717  { ISD::SETCC, MVT::v16i16, 1 },
1718  { ISD::SETCC, MVT::v32i8, 1 },
1719 
1720  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
1721  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
1722  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
1723  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
1724  };
1725 
1726  static const CostTblEntry AVX1CostTbl[] = {
1727  { ISD::SETCC, MVT::v4f64, 1 },
1728  { ISD::SETCC, MVT::v8f32, 1 },
1729  // AVX1 does not support 8-wide integer compare.
1730  { ISD::SETCC, MVT::v4i64, 4 },
1731  { ISD::SETCC, MVT::v8i32, 4 },
1732  { ISD::SETCC, MVT::v16i16, 4 },
1733  { ISD::SETCC, MVT::v32i8, 4 },
1734 
1735  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
1736  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
1737  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
1738  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
1739  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
1740  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
1741  };
1742 
1743  static const CostTblEntry SSE42CostTbl[] = {
1744  { ISD::SETCC, MVT::v2f64, 1 },
1745  { ISD::SETCC, MVT::v4f32, 1 },
1746  { ISD::SETCC, MVT::v2i64, 1 },
1747  };
1748 
1749  static const CostTblEntry SSE41CostTbl[] = {
1750  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
1751  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
1752  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
1753  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
1754  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
1755  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
1756  };
1757 
1758  static const CostTblEntry SSE2CostTbl[] = {
1759  { ISD::SETCC, MVT::v2f64, 2 },
1760  { ISD::SETCC, MVT::f64, 1 },
1761  { ISD::SETCC, MVT::v2i64, 8 },
1762  { ISD::SETCC, MVT::v4i32, 1 },
1763  { ISD::SETCC, MVT::v8i16, 1 },
1764  { ISD::SETCC, MVT::v16i8, 1 },
1765 
1766  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
1767  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
1768  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
1769  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
1770  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
1771  };
1772 
1773  static const CostTblEntry SSE1CostTbl[] = {
1774  { ISD::SETCC, MVT::v4f32, 2 },
1775  { ISD::SETCC, MVT::f32, 1 },
1776 
1777  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
1778  };
1779 
1780  if (ST->hasBWI())
1781  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1782  return LT.first * (ExtraCost + Entry->Cost);
1783 
1784  if (ST->hasAVX512())
1785  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1786  return LT.first * (ExtraCost + Entry->Cost);
1787 
1788  if (ST->hasAVX2())
1789  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1790  return LT.first * (ExtraCost + Entry->Cost);
1791 
1792  if (ST->hasAVX())
1793  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1794  return LT.first * (ExtraCost + Entry->Cost);
1795 
1796  if (ST->hasSSE42())
1797  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1798  return LT.first * (ExtraCost + Entry->Cost);
1799 
1800  if (ST->hasSSE41())
1801  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
1802  return LT.first * (ExtraCost + Entry->Cost);
1803 
1804  if (ST->hasSSE2())
1805  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1806  return LT.first * (ExtraCost + Entry->Cost);
1807 
1808  if (ST->hasSSE1())
1809  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1810  return LT.first * (ExtraCost + Entry->Cost);
1811 
1812  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1813 }
1814 
1817 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1818  ArrayRef<Type *> Tys, FastMathFlags FMF,
1819  unsigned ScalarizationCostPassed) {
1820  // Costs should match the codegen from:
1821  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1822  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1823  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1824  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1825  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1826  static const CostTblEntry AVX512CDCostTbl[] = {
1827  { ISD::CTLZ, MVT::v8i64, 1 },
1828  { ISD::CTLZ, MVT::v16i32, 1 },
1829  { ISD::CTLZ, MVT::v32i16, 8 },
1830  { ISD::CTLZ, MVT::v64i8, 20 },
1831  { ISD::CTLZ, MVT::v4i64, 1 },
1832  { ISD::CTLZ, MVT::v8i32, 1 },
1833  { ISD::CTLZ, MVT::v16i16, 4 },
1834  { ISD::CTLZ, MVT::v32i8, 10 },
1835  { ISD::CTLZ, MVT::v2i64, 1 },
1836  { ISD::CTLZ, MVT::v4i32, 1 },
1837  { ISD::CTLZ, MVT::v8i16, 4 },
1838  { ISD::CTLZ, MVT::v16i8, 4 },
1839  };
1840  static const CostTblEntry AVX512BWCostTbl[] = {
1841  { ISD::BITREVERSE, MVT::v8i64, 5 },
1842  { ISD::BITREVERSE, MVT::v16i32, 5 },
1843  { ISD::BITREVERSE, MVT::v32i16, 5 },
1844  { ISD::BITREVERSE, MVT::v64i8, 5 },
1845  { ISD::CTLZ, MVT::v8i64, 23 },
1846  { ISD::CTLZ, MVT::v16i32, 22 },
1847  { ISD::CTLZ, MVT::v32i16, 18 },
1848  { ISD::CTLZ, MVT::v64i8, 17 },
1849  { ISD::CTPOP, MVT::v8i64, 7 },
1850  { ISD::CTPOP, MVT::v16i32, 11 },
1851  { ISD::CTPOP, MVT::v32i16, 9 },
1852  { ISD::CTPOP, MVT::v64i8, 6 },
1853  { ISD::CTTZ, MVT::v8i64, 10 },
1854  { ISD::CTTZ, MVT::v16i32, 14 },
1855  { ISD::CTTZ, MVT::v32i16, 12 },
1856  { ISD::CTTZ, MVT::v64i8, 9 },
1857  { ISD::SADDSAT, MVT::v32i16, 1 },
1858  { ISD::SADDSAT, MVT::v64i8, 1 },
1859  { ISD::SSUBSAT, MVT::v32i16, 1 },
1860  { ISD::SSUBSAT, MVT::v64i8, 1 },
1861  { ISD::UADDSAT, MVT::v32i16, 1 },
1862  { ISD::UADDSAT, MVT::v64i8, 1 },
1863  { ISD::USUBSAT, MVT::v32i16, 1 },
1864  { ISD::USUBSAT, MVT::v64i8, 1 },
1865  };
1866  static const CostTblEntry AVX512CostTbl[] = {
1867  { ISD::BITREVERSE, MVT::v8i64, 36 },
1868  { ISD::BITREVERSE, MVT::v16i32, 24 },
1869  { ISD::CTLZ, MVT::v8i64, 29 },
1870  { ISD::CTLZ, MVT::v16i32, 35 },
1871  { ISD::CTPOP, MVT::v8i64, 16 },
1872  { ISD::CTPOP, MVT::v16i32, 24 },
1873  { ISD::CTTZ, MVT::v8i64, 20 },
1874  { ISD::CTTZ, MVT::v16i32, 28 },
1875  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
1876  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
1877  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
1878  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
1879  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
1880  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
1881  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
1882  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
1883  };
1884  static const CostTblEntry XOPCostTbl[] = {
1885  { ISD::BITREVERSE, MVT::v4i64, 4 },
1886  { ISD::BITREVERSE, MVT::v8i32, 4 },
1887  { ISD::BITREVERSE, MVT::v16i16, 4 },
1888  { ISD::BITREVERSE, MVT::v32i8, 4 },
1889  { ISD::BITREVERSE, MVT::v2i64, 1 },
1890  { ISD::BITREVERSE, MVT::v4i32, 1 },
1891  { ISD::BITREVERSE, MVT::v8i16, 1 },
1892  { ISD::BITREVERSE, MVT::v16i8, 1 },
1893  { ISD::BITREVERSE, MVT::i64, 3 },
1894  { ISD::BITREVERSE, MVT::i32, 3 },
1895  { ISD::BITREVERSE, MVT::i16, 3 },
1896  { ISD::BITREVERSE, MVT::i8, 3 }
1897  };
1898  static const CostTblEntry AVX2CostTbl[] = {
1899  { ISD::BITREVERSE, MVT::v4i64, 5 },
1900  { ISD::BITREVERSE, MVT::v8i32, 5 },
1901  { ISD::BITREVERSE, MVT::v16i16, 5 },
1902  { ISD::BITREVERSE, MVT::v32i8, 5 },
1903  { ISD::BSWAP, MVT::v4i64, 1 },
1904  { ISD::BSWAP, MVT::v8i32, 1 },
1905  { ISD::BSWAP, MVT::v16i16, 1 },
1906  { ISD::CTLZ, MVT::v4i64, 23 },
1907  { ISD::CTLZ, MVT::v8i32, 18 },
1908  { ISD::CTLZ, MVT::v16i16, 14 },
1909  { ISD::CTLZ, MVT::v32i8, 9 },
1910  { ISD::CTPOP, MVT::v4i64, 7 },
1911  { ISD::CTPOP, MVT::v8i32, 11 },
1912  { ISD::CTPOP, MVT::v16i16, 9 },
1913  { ISD::CTPOP, MVT::v32i8, 6 },
1914  { ISD::CTTZ, MVT::v4i64, 10 },
1915  { ISD::CTTZ, MVT::v8i32, 14 },
1916  { ISD::CTTZ, MVT::v16i16, 12 },
1917  { ISD::CTTZ, MVT::v32i8, 9 },
1918  { ISD::SADDSAT, MVT::v16i16, 1 },
1919  { ISD::SADDSAT, MVT::v32i8, 1 },
1920  { ISD::SSUBSAT, MVT::v16i16, 1 },
1921  { ISD::SSUBSAT, MVT::v32i8, 1 },
1922  { ISD::UADDSAT, MVT::v16i16, 1 },
1923  { ISD::UADDSAT, MVT::v32i8, 1 },
1924  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
1925  { ISD::USUBSAT, MVT::v16i16, 1 },
1926  { ISD::USUBSAT, MVT::v32i8, 1 },
1927  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
1928  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1929  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1930  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1931  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1932  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1933  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1934  };
1935  static const CostTblEntry AVX1CostTbl[] = {
1936  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
1937  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
1938  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1939  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
1940  { ISD::BSWAP, MVT::v4i64, 4 },
1941  { ISD::BSWAP, MVT::v8i32, 4 },
1942  { ISD::BSWAP, MVT::v16i16, 4 },
1943  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
1944  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
1945  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1946  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1947  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
1948  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
1949  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1950  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
1951  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
1952  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
1953  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1954  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1955  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1956  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1957  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1958  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1959  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1960  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1961  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
1962  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1963  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1964  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
1965  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
1966  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
1967  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
1968  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
1969  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
1970  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
1971  };
1972  static const CostTblEntry GLMCostTbl[] = {
1973  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
1974  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
1975  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
1976  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
1977  };
1978  static const CostTblEntry SLMCostTbl[] = {
1979  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
1980  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
1981  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
1982  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
1983  };
1984  static const CostTblEntry SSE42CostTbl[] = {
1985  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
1986  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
1987  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
1988  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
1989  };
1990  static const CostTblEntry SSSE3CostTbl[] = {
1991  { ISD::BITREVERSE, MVT::v2i64, 5 },
1992  { ISD::BITREVERSE, MVT::v4i32, 5 },
1993  { ISD::BITREVERSE, MVT::v8i16, 5 },
1994  { ISD::BITREVERSE, MVT::v16i8, 5 },
1995  { ISD::BSWAP, MVT::v2i64, 1 },
1996  { ISD::BSWAP, MVT::v4i32, 1 },
1997  { ISD::BSWAP, MVT::v8i16, 1 },
1998  { ISD::CTLZ, MVT::v2i64, 23 },
1999  { ISD::CTLZ, MVT::v4i32, 18 },
2000  { ISD::CTLZ, MVT::v8i16, 14 },
2001  { ISD::CTLZ, MVT::v16i8, 9 },
2002  { ISD::CTPOP, MVT::v2i64, 7 },
2003  { ISD::CTPOP, MVT::v4i32, 11 },
2004  { ISD::CTPOP, MVT::v8i16, 9 },
2005  { ISD::CTPOP, MVT::v16i8, 6 },
2006  { ISD::CTTZ, MVT::v2i64, 10 },
2007  { ISD::CTTZ, MVT::v4i32, 14 },
2008  { ISD::CTTZ, MVT::v8i16, 12 },
2009  { ISD::CTTZ, MVT::v16i8, 9 }
2010  };
2011  static const CostTblEntry SSE2CostTbl[] = {
2012  { ISD::BITREVERSE, MVT::v2i64, 29 },
2013  { ISD::BITREVERSE, MVT::v4i32, 27 },
2014  { ISD::BITREVERSE, MVT::v8i16, 27 },
2015  { ISD::BITREVERSE, MVT::v16i8, 20 },
2016  { ISD::BSWAP, MVT::v2i64, 7 },
2017  { ISD::BSWAP, MVT::v4i32, 7 },
2018  { ISD::BSWAP, MVT::v8i16, 7 },
2019  { ISD::CTLZ, MVT::v2i64, 25 },
2020  { ISD::CTLZ, MVT::v4i32, 26 },
2021  { ISD::CTLZ, MVT::v8i16, 20 },
2022  { ISD::CTLZ, MVT::v16i8, 17 },
2023  { ISD::CTPOP, MVT::v2i64, 12 },
2024  { ISD::CTPOP, MVT::v4i32, 15 },
2025  { ISD::CTPOP, MVT::v8i16, 13 },
2026  { ISD::CTPOP, MVT::v16i8, 10 },
2027  { ISD::CTTZ, MVT::v2i64, 14 },
2028  { ISD::CTTZ, MVT::v4i32, 18 },
2029  { ISD::CTTZ, MVT::v8i16, 16 },
2030  { ISD::CTTZ, MVT::v16i8, 13 },
2031  { ISD::SADDSAT, MVT::v8i16, 1 },
2032  { ISD::SADDSAT, MVT::v16i8, 1 },
2033  { ISD::SSUBSAT, MVT::v8i16, 1 },
2034  { ISD::SSUBSAT, MVT::v16i8, 1 },
2035  { ISD::UADDSAT, MVT::v8i16, 1 },
2036  { ISD::UADDSAT, MVT::v16i8, 1 },
2037  { ISD::USUBSAT, MVT::v8i16, 1 },
2038  { ISD::USUBSAT, MVT::v16i8, 1 },
2039  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2040  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2041  };
2042  static const CostTblEntry SSE1CostTbl[] = {
2043  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2044  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2045  };
2046  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2047  { ISD::BITREVERSE, MVT::i64, 14 },
2048  { ISD::SADDO, MVT::i64, 1 },
2049  { ISD::UADDO, MVT::i64, 1 },
2050  };
2051  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2052  { ISD::BITREVERSE, MVT::i32, 14 },
2053  { ISD::BITREVERSE, MVT::i16, 14 },
2054  { ISD::BITREVERSE, MVT::i8, 11 },
2055  { ISD::SADDO, MVT::i32, 1 },
2056  { ISD::SADDO, MVT::i16, 1 },
2057  { ISD::SADDO, MVT::i8, 1 },
2058  { ISD::UADDO, MVT::i32, 1 },
2059  { ISD::UADDO, MVT::i16, 1 },
2060  { ISD::UADDO, MVT::i8, 1 },
2061  };
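 // Illustrative example: a ctpop of <8 x i32> on an AVX2 (non-AVX-512) target
 // legalizes to v8i32 and hits the AVX2 table above (cost 11), while a
 // <16 x i32> ctpop on the same target is split in two, so LT.first doubles
 // the looked-up cost.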
2062 
2063  Type *OpTy = RetTy;
2064  unsigned ISD = ISD::DELETED_NODE;
2065  switch (IID) {
2066  default:
2067  break;
2068  case Intrinsic::bitreverse:
2069  ISD = ISD::BITREVERSE;
2070  break;
2071  case Intrinsic::bswap:
2072  ISD = ISD::BSWAP;
2073  break;
2074  case Intrinsic::ctlz:
2075  ISD = ISD::CTLZ;
2076  break;
2077  case Intrinsic::ctpop:
2078  ISD = ISD::CTPOP;
2079  break;
2080  case Intrinsic::cttz:
2081  ISD = ISD::CTTZ;
2082  break;
2083  case Intrinsic::sadd_sat:
2084  ISD = ISD::SADDSAT;
2085  break;
2086  case Intrinsic::ssub_sat:
2087  ISD = ISD::SSUBSAT;
2088  break;
2089  case Intrinsic::uadd_sat:
2090  ISD = ISD::UADDSAT;
2091  break;
2092  case Intrinsic::usub_sat:
2093  ISD = ISD::USUBSAT;
2094  break;
2095  case Intrinsic::sqrt:
2096  ISD = ISD::FSQRT;
2097  break;
2098  case Intrinsic::sadd_with_overflow:
2099  case Intrinsic::ssub_with_overflow:
2100  // SSUBO has the same costs, so don't duplicate.
2101  ISD = ISD::SADDO;
2102  OpTy = RetTy->getContainedType(0);
2103  break;
2104  case Intrinsic::uadd_with_overflow:
2105  case Intrinsic::usub_with_overflow:
2106  // USUBO has the same costs, so don't duplicate.
2107  ISD = ISD::UADDO;
2108  OpTy = RetTy->getContainedType(0);
2109  break;
2110  }
2111 
2112  if (ISD != ISD::DELETED_NODE) {
2113  // Legalize the type.
2114  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2115  MVT MTy = LT.second;
2116 
2117  // Attempt to lookup cost.
2118  if (ST->isGLM())
2119  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2120  return LT.first * Entry->Cost;
2121 
2122  if (ST->isSLM())
2123  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2124  return LT.first * Entry->Cost;
2125 
2126  if (ST->hasCDI())
2127  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2128  return LT.first * Entry->Cost;
2129 
2130  if (ST->hasBWI())
2131  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2132  return LT.first * Entry->Cost;
2133 
2134  if (ST->hasAVX512())
2135  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2136  return LT.first * Entry->Cost;
2137 
2138  if (ST->hasXOP())
2139  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2140  return LT.first * Entry->Cost;
2141 
2142  if (ST->hasAVX2())
2143  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2144  return LT.first * Entry->Cost;
2145 
2146  if (ST->hasAVX())
2147  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2148  return LT.first * Entry->Cost;
2149 
2150  if (ST->hasSSE42())
2151  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2152  return LT.first * Entry->Cost;
2153 
2154  if (ST->hasSSSE3())
2155  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2156  return LT.first * Entry->Cost;
2157 
2158  if (ST->hasSSE2())
2159  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2160  return LT.first * Entry->Cost;
2161 
2162  if (ST->hasSSE1())
2163  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2164  return LT.first * Entry->Cost;
2165 
2166  if (ST->is64Bit())
2167  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2168  return LT.first * Entry->Cost;
2169 
2170  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2171  return LT.first * Entry->Cost;
2172  }
2173 
2174  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
2175 }
2176 
2177 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
2178  ArrayRef<Value *> Args, FastMathFlags FMF,
2179  unsigned VF) {
2180  static const CostTblEntry AVX512CostTbl[] = {
2181  { ISD::ROTL, MVT::v8i64, 1 },
2182  { ISD::ROTL, MVT::v4i64, 1 },
2183  { ISD::ROTL, MVT::v2i64, 1 },
2184  { ISD::ROTL, MVT::v16i32, 1 },
2185  { ISD::ROTL, MVT::v8i32, 1 },
2186  { ISD::ROTL, MVT::v4i32, 1 },
2187  { ISD::ROTR, MVT::v8i64, 1 },
2188  { ISD::ROTR, MVT::v4i64, 1 },
2189  { ISD::ROTR, MVT::v2i64, 1 },
2190  { ISD::ROTR, MVT::v16i32, 1 },
2191  { ISD::ROTR, MVT::v8i32, 1 },
2192  { ISD::ROTR, MVT::v4i32, 1 }
2193  };
2194  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2195  static const CostTblEntry XOPCostTbl[] = {
2196  { ISD::ROTL, MVT::v4i64, 4 },
2197  { ISD::ROTL, MVT::v8i32, 4 },
2198  { ISD::ROTL, MVT::v16i16, 4 },
2199  { ISD::ROTL, MVT::v32i8, 4 },
2200  { ISD::ROTL, MVT::v2i64, 1 },
2201  { ISD::ROTL, MVT::v4i32, 1 },
2202  { ISD::ROTL, MVT::v8i16, 1 },
2203  { ISD::ROTL, MVT::v16i8, 1 },
2204  { ISD::ROTR, MVT::v4i64, 6 },
2205  { ISD::ROTR, MVT::v8i32, 6 },
2206  { ISD::ROTR, MVT::v16i16, 6 },
2207  { ISD::ROTR, MVT::v32i8, 6 },
2208  { ISD::ROTR, MVT::v2i64, 2 },
2209  { ISD::ROTR, MVT::v4i32, 2 },
2210  { ISD::ROTR, MVT::v8i16, 2 },
2211  { ISD::ROTR, MVT::v16i8, 2 }
2212  };
2213  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2214  { ISD::ROTL, MVT::i64, 1 },
2215  { ISD::ROTR, MVT::i64, 1 },
2216  { ISD::FSHL, MVT::i64, 4 }
2217  };
2218  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2219  { ISD::ROTL, MVT::i32, 1 },
2220  { ISD::ROTL, MVT::i16, 1 },
2221  { ISD::ROTL, MVT::i8, 1 },
2222  { ISD::ROTR, MVT::i32, 1 },
2223  { ISD::ROTR, MVT::i16, 1 },
2224  { ISD::ROTR, MVT::i8, 1 },
2225  { ISD::FSHL, MVT::i32, 4 },
2226  { ISD::FSHL, MVT::i16, 4 },
2227  { ISD::FSHL, MVT::i8, 4 }
2228  };
2229 
2230  unsigned ISD = ISD::DELETED_NODE;
2231  switch (IID) {
2232  default:
2233  break;
2234  case Intrinsic::fshl:
2235  ISD = ISD::FSHL;
2236  if (Args[0] == Args[1])
2237  ISD = ISD::ROTL;
2238  break;
2239  case Intrinsic::fshr:
2240  // FSHR has the same costs, so don't duplicate.
2241  ISD = ISD::FSHL;
2242  if (Args[0] == Args[1])
2243  ISD = ISD::ROTR;
2244  break;
2245  }
2246 
2247  if (ISD != ISD::DELETED_NODE) {
2248  // Legalize the type.
2249  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2250  MVT MTy = LT.second;
2251 
2252  // Attempt to lookup cost.
2253  if (ST->hasAVX512())
2254  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2255  return LT.first * Entry->Cost;
2256 
2257  if (ST->hasXOP())
2258  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2259  return LT.first * Entry->Cost;
2260 
2261  if (ST->is64Bit())
2262  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2263  return LT.first * Entry->Cost;
2264 
2265  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2266  return LT.first * Entry->Cost;
2267  }
2268 
2269  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
2270 }
2271 
2272 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2273  assert(Val->isVectorTy() && "This must be a vector type");
2274 
2275  Type *ScalarType = Val->getScalarType();
2276 
2277  if (Index != -1U) {
2278  // Legalize the type.
2279  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2280 
2281  // This type is legalized to a scalar type.
2282  if (!LT.second.isVector())
2283  return 0;
2284 
2285  // The type may be split. Normalize the index to the new type.
2286  unsigned Width = LT.second.getVectorNumElements();
2287  Index = Index % Width;
2288 
2289  // Floating point scalars are already located in index #0.
2290  if (ScalarType->isFloatingPointTy() && Index == 0)
2291  return 0;
2292  }
2293 
2294  // Add to the base cost if we know that the extracted element of a vector is
2295  // destined to be moved to and used in the integer register file.
2296  int RegisterFileMoveCost = 0;
2297  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2298  RegisterFileMoveCost = 1;
2299 
2300  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
2301 }
2302 
2303 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
2304  unsigned AddressSpace, const Instruction *I) {
2305  // Handle non-power-of-two vectors such as <3 x float>
2306  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
2307  unsigned NumElem = VTy->getVectorNumElements();
2308 
2309  // Handle a few common cases:
2310  // <3 x float>
2311  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
2312  // Cost = 64 bit store + extract + 32 bit store.
2313  return 3;
2314 
2315  // <3 x double>
2316  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
2317  // Cost = 128 bit store + unpack + 64 bit store.
2318  return 3;
2319 
2320  // Assume that all other non-power-of-two numbers are scalarized.
2321  if (!isPowerOf2_32(NumElem)) {
2322  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
2323  AddressSpace);
2324  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
2325  Opcode == Instruction::Store);
2326  return NumElem * Cost + SplitCost;
2327  }
2328  }
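 // Illustrative example: a <5 x float> store is scalarized above into five
 // 32-bit stores plus the scalarization overhead, whereas a <4 x float> is a
 // single legal vector store and is costed by the legalized-type path below.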
2329 
2330  // Legalize the type.
2331  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
2332  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
2333  "Invalid Opcode");
2334 
2335  // Each load/store unit costs 1.
2336  int Cost = LT.first * 1;
2337 
2338  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
2339  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
2340  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
2341  Cost *= 2;
2342 
2343  return Cost;
2344 }
2345 
2346 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
2347  unsigned Alignment,
2348  unsigned AddressSpace) {
2349  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
2350  if (!SrcVTy)
2351  // For a scalar type, take the regular cost without the mask.
2352  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
2353 
2354  unsigned NumElem = SrcVTy->getVectorNumElements();
2355  VectorType *MaskTy =
2356  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
2357  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
2358  (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
2359  !isPowerOf2_32(NumElem)) {
2360  // Scalarization
2361  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
2362  int ScalarCompareCost = getCmpSelInstrCost(
2363  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
2364  int BranchCost = getCFInstrCost(Instruction::Br);
2365  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
2366 
2367  int ValueSplitCost = getScalarizationOverhead(
2368  SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
2369  int MemopCost =
2370  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2371  Alignment, AddressSpace);
2372  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
2373  }
2374 
2375  // Legalize the type.
2376  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2377  auto VT = TLI->getValueType(DL, SrcVTy);
2378  int Cost = 0;
2379  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
2380  LT.second.getVectorNumElements() == NumElem)
2381  // Promotion requires expand/truncate for data and a shuffle for mask.
2382  Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
2383  getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
2384 
2385  else if (LT.second.getVectorNumElements() > NumElem) {
2386  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
2387  LT.second.getVectorNumElements());
2388  // Expanding requires filling the mask with zeroes.
2389  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
2390  }
2391  if (!ST->hasAVX512())
2392  return Cost + LT.first*4; // Each maskmov costs 4
2393 
2394  // AVX-512 masked load/store is cheaper.
2395  return Cost+LT.first;
2396 }
2397 
2398 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2399  const SCEV *Ptr) {
2400  // Address computations in vectorized code with non-consecutive addresses will
2401  // likely result in more instructions compared to scalar code where the
2402  // computation can more often be merged into the index mode. The resulting
2403  // extra micro-ops can significantly decrease throughput.
2404  const unsigned NumVectorInstToHideOverhead = 10;
2405 
2406  // Cost modeling of Strided Access Computation is hidden by the indexing
2407  // modes of X86 regardless of the stride value. We don't believe that there
2408  // is a difference between constant strided access in general and a constant
2409  // stride value which is less than or equal to 64.
2410  // Even in the case of (loop invariant) stride whose value is not known at
2411  // compile time, the address computation will not incur more than one extra
2412  // ADD instruction.
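 // For example: a vector access whose address is not strided at all is charged
 // the full NumVectorInstToHideOverhead, a strided access whose step is only
 // known at run time costs a single extra ADD, and a constant-stride access
 // falls through to the base implementation.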
2413  if (Ty->isVectorTy() && SE) {
2414  if (!BaseT::isStridedAccess(Ptr))
2415  return NumVectorInstToHideOverhead;
2416  if (!BaseT::getConstantStrideStep(SE, Ptr))
2417  return 1;
2418  }
2419 
2420  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2421 }
2422 
2423 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2424  bool IsPairwise) {
2425 
2426  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2427 
2428  MVT MTy = LT.second;
2429 
2430  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2431  assert(ISD && "Invalid opcode");
2432 
2433  // We use the Intel Architecture Code Analyzer (IACA) to measure throughput
2434  // and use that as the cost.
2435 
2436  static const CostTblEntry SSE42CostTblPairWise[] = {
2437  { ISD::FADD, MVT::v2f64, 2 },
2438  { ISD::FADD, MVT::v4f32, 4 },
2439  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2440  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2441  { ISD::ADD, MVT::v8i16, 5 },
2442  };
2443 
2444  static const CostTblEntry AVX1CostTblPairWise[] = {
2445  { ISD::FADD, MVT::v4f32, 4 },
2446  { ISD::FADD, MVT::v4f64, 5 },
2447  { ISD::FADD, MVT::v8f32, 7 },
2448  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2449  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2450  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
2451  { ISD::ADD, MVT::v8i16, 5 },
2452  { ISD::ADD, MVT::v8i32, 5 },
2453  };
2454 
2455  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2456  { ISD::FADD, MVT::v2f64, 2 },
2457  { ISD::FADD, MVT::v4f32, 4 },
2458  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2459  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
2460  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
2461  };
2462 
2463  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2464  { ISD::FADD, MVT::v4f32, 3 },
2465  { ISD::FADD, MVT::v4f64, 3 },
2466  { ISD::FADD, MVT::v8f32, 4 },
2467  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2468  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
2469  { ISD::ADD, MVT::v4i64, 3 },
2470  { ISD::ADD, MVT::v8i16, 4 },
2471  { ISD::ADD, MVT::v8i32, 5 },
2472  };
2473 
2474  if (IsPairwise) {
2475  if (ST->hasAVX())
2476  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2477  return LT.first * Entry->Cost;
2478 
2479  if (ST->hasSSE42())
2480  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2481  return LT.first * Entry->Cost;
2482  } else {
2483  if (ST->hasAVX())
2484  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2485  return LT.first * Entry->Cost;
2486 
2487  if (ST->hasSSE42())
2488  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2489  return LT.first * Entry->Cost;
2490  }
2491 
2492  static const CostTblEntry AVX2BoolReduction[] = {
2493  { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
2494  { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
2495  { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
2496  { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
2497  };
2498 
2499  static const CostTblEntry AVX1BoolReduction[] = {
2500  { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
2501  { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
2502  { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
2503  { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
2504  { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
2505  { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
2506  { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
2507  { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
2508  };
2509 
2510  static const CostTblEntry SSE2BoolReduction[] = {
2511  { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
2512  { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
2513  { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
2514  { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
2515  { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
2516  { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
2517  { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
2518  { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
2519  };
2520 
2521  // Handle bool allof/anyof patterns.
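 // For instance, an AND reduction over <8 x i1> (an "all of" test) typically
 // legalizes to v8i16 on SSE2 and is costed as pmovmskb + cmp (2) below.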
2522  if (ValTy->getVectorElementType()->isIntegerTy(1)) {
2523  if (ST->hasAVX2())
2524  if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
2525  return LT.first * Entry->Cost;
2526  if (ST->hasAVX())
2527  if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
2528  return LT.first * Entry->Cost;
2529  if (ST->hasSSE2())
2530  if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
2531  return LT.first * Entry->Cost;
2532  }
2533 
2534  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2535 }
2536 
2537 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2538  bool IsPairwise, bool IsUnsigned) {
2539  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2540 
2541  MVT MTy = LT.second;
2542 
2543  int ISD;
2544  if (ValTy->isIntOrIntVectorTy()) {
2545  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2546  } else {
2547  assert(ValTy->isFPOrFPVectorTy() &&
2548  "Expected floating point or integer vector type.");
2549  ISD = ISD::FMINNUM;
2550  }
2551 
2552  // We use the Intel Architecture Code Analyzer (IACA) to measure throughput
2553  // and use that as the cost.
2554 
2555  static const CostTblEntry SSE1CostTblPairWise[] = {
2556  {ISD::FMINNUM, MVT::v4f32, 4},
2557  };
2558 
2559  static const CostTblEntry SSE2CostTblPairWise[] = {
2560  {ISD::FMINNUM, MVT::v2f64, 3},
2561  {ISD::SMIN, MVT::v2i64, 6},
2562  {ISD::UMIN, MVT::v2i64, 8},
2563  {ISD::SMIN, MVT::v4i32, 6},
2564  {ISD::UMIN, MVT::v4i32, 8},
2565  {ISD::SMIN, MVT::v8i16, 4},
2566  {ISD::UMIN, MVT::v8i16, 6},
2567  {ISD::SMIN, MVT::v16i8, 8},
2568  {ISD::UMIN, MVT::v16i8, 6},
2569  };
2570 
2571  static const CostTblEntry SSE41CostTblPairWise[] = {
2572  {ISD::FMINNUM, MVT::v4f32, 2},
2573  {ISD::SMIN, MVT::v2i64, 9},
2574  {ISD::UMIN, MVT::v2i64,10},
2575  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2576  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2577  {ISD::SMIN, MVT::v8i16, 2},
2578  {ISD::UMIN, MVT::v8i16, 2},
2579  {ISD::SMIN, MVT::v16i8, 3},
2580  {ISD::UMIN, MVT::v16i8, 3},
2581  };
2582 
2583  static const CostTblEntry SSE42CostTblPairWise[] = {
2584  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2585  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2586  };
2587 
2588  static const CostTblEntry AVX1CostTblPairWise[] = {
2589  {ISD::FMINNUM, MVT::v4f32, 1},
2590  {ISD::FMINNUM, MVT::v4f64, 1},
2591  {ISD::FMINNUM, MVT::v8f32, 2},
2592  {ISD::SMIN, MVT::v2i64, 3},
2593  {ISD::UMIN, MVT::v2i64, 3},
2594  {ISD::SMIN, MVT::v4i32, 1},
2595  {ISD::UMIN, MVT::v4i32, 1},
2596  {ISD::SMIN, MVT::v8i16, 1},
2597  {ISD::UMIN, MVT::v8i16, 1},
2598  {ISD::SMIN, MVT::v16i8, 2},
2599  {ISD::UMIN, MVT::v16i8, 2},
2600  {ISD::SMIN, MVT::v4i64, 7},
2601  {ISD::UMIN, MVT::v4i64, 7},
2602  {ISD::SMIN, MVT::v8i32, 3},
2603  {ISD::UMIN, MVT::v8i32, 3},
2604  {ISD::SMIN, MVT::v16i16, 3},
2605  {ISD::UMIN, MVT::v16i16, 3},
2606  {ISD::SMIN, MVT::v32i8, 3},
2607  {ISD::UMIN, MVT::v32i8, 3},
2608  };
2609 
2610  static const CostTblEntry AVX2CostTblPairWise[] = {
2611  {ISD::SMIN, MVT::v4i64, 2},
2612  {ISD::UMIN, MVT::v4i64, 2},
2613  {ISD::SMIN, MVT::v8i32, 1},
2614  {ISD::UMIN, MVT::v8i32, 1},
2615  {ISD::SMIN, MVT::v16i16, 1},
2616  {ISD::UMIN, MVT::v16i16, 1},
2617  {ISD::SMIN, MVT::v32i8, 2},
2618  {ISD::UMIN, MVT::v32i8, 2},
2619  };
2620 
2621  static const CostTblEntry AVX512CostTblPairWise[] = {
2622  {ISD::FMINNUM, MVT::v8f64, 1},
2623  {ISD::FMINNUM, MVT::v16f32, 2},
2624  {ISD::SMIN, MVT::v8i64, 2},
2625  {ISD::UMIN, MVT::v8i64, 2},
2626  {ISD::SMIN, MVT::v16i32, 1},
2627  {ISD::UMIN, MVT::v16i32, 1},
2628  };
2629 
2630  static const CostTblEntry SSE1CostTblNoPairWise[] = {
2631  {ISD::FMINNUM, MVT::v4f32, 4},
2632  };
2633 
2634  static const CostTblEntry SSE2CostTblNoPairWise[] = {
2635  {ISD::FMINNUM, MVT::v2f64, 3},
2636  {ISD::SMIN, MVT::v2i64, 6},
2637  {ISD::UMIN, MVT::v2i64, 8},
2638  {ISD::SMIN, MVT::v4i32, 6},
2639  {ISD::UMIN, MVT::v4i32, 8},
2640  {ISD::SMIN, MVT::v8i16, 4},
2641  {ISD::UMIN, MVT::v8i16, 6},
2642  {ISD::SMIN, MVT::v16i8, 8},
2643  {ISD::UMIN, MVT::v16i8, 6},
2644  };
2645 
2646  static const CostTblEntry SSE41CostTblNoPairWise[] = {
2647  {ISD::FMINNUM, MVT::v4f32, 3},
2648  {ISD::SMIN, MVT::v2i64, 9},
2649  {ISD::UMIN, MVT::v2i64,11},
2650  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2651  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2652  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2653  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2654  {ISD::SMIN, MVT::v16i8, 3},
2655  {ISD::UMIN, MVT::v16i8, 3},
2656  };
2657 
2658  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2659  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2660  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2661  };
2662 
2663  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2664  {ISD::FMINNUM, MVT::v4f32, 1},
2665  {ISD::FMINNUM, MVT::v4f64, 1},
2666  {ISD::FMINNUM, MVT::v8f32, 1},
2667  {ISD::SMIN, MVT::v2i64, 3},
2668  {ISD::UMIN, MVT::v2i64, 3},
2669  {ISD::SMIN, MVT::v4i32, 1},
2670  {ISD::UMIN, MVT::v4i32, 1},
2671  {ISD::SMIN, MVT::v8i16, 1},
2672  {ISD::UMIN, MVT::v8i16, 1},
2673  {ISD::SMIN, MVT::v16i8, 2},
2674  {ISD::UMIN, MVT::v16i8, 2},
2675  {ISD::SMIN, MVT::v4i64, 7},
2676  {ISD::UMIN, MVT::v4i64, 7},
2677  {ISD::SMIN, MVT::v8i32, 2},
2678  {ISD::UMIN, MVT::v8i32, 2},
2679  {ISD::SMIN, MVT::v16i16, 2},
2680  {ISD::UMIN, MVT::v16i16, 2},
2681  {ISD::SMIN, MVT::v32i8, 2},
2682  {ISD::UMIN, MVT::v32i8, 2},
2683  };
2684 
2685  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2686  {ISD::SMIN, MVT::v4i64, 1},
2687  {ISD::UMIN, MVT::v4i64, 1},
2688  {ISD::SMIN, MVT::v8i32, 1},
2689  {ISD::UMIN, MVT::v8i32, 1},
2690  {ISD::SMIN, MVT::v16i16, 1},
2691  {ISD::UMIN, MVT::v16i16, 1},
2692  {ISD::SMIN, MVT::v32i8, 1},
2693  {ISD::UMIN, MVT::v32i8, 1},
2694  };
2695 
2696  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2697  {ISD::FMINNUM, MVT::v8f64, 1},
2698  {ISD::FMINNUM, MVT::v16f32, 2},
2699  {ISD::SMIN, MVT::v8i64, 1},
2700  {ISD::UMIN, MVT::v8i64, 1},
2701  {ISD::SMIN, MVT::v16i32, 1},
2702  {ISD::UMIN, MVT::v16i32, 1},
2703  };
2704 
2705  if (IsPairwise) {
2706  if (ST->hasAVX512())
2707  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2708  return LT.first * Entry->Cost;
2709 
2710  if (ST->hasAVX2())
2711  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2712  return LT.first * Entry->Cost;
2713 
2714  if (ST->hasAVX())
2715  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2716  return LT.first * Entry->Cost;
2717 
2718  if (ST->hasSSE42())
2719  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2720  return LT.first * Entry->Cost;
2721 
2722  if (ST->hasSSE41())
2723  if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
2724  return LT.first * Entry->Cost;
2725 
2726  if (ST->hasSSE2())
2727  if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
2728  return LT.first * Entry->Cost;
2729 
2730  if (ST->hasSSE1())
2731  if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
2732  return LT.first * Entry->Cost;
2733  } else {
2734  if (ST->hasAVX512())
2735  if (const auto *Entry =
2736  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2737  return LT.first * Entry->Cost;
2738 
2739  if (ST->hasAVX2())
2740  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2741  return LT.first * Entry->Cost;
2742 
2743  if (ST->hasAVX())
2744  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2745  return LT.first * Entry->Cost;
2746 
2747  if (ST->hasSSE42())
2748  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2749  return LT.first * Entry->Cost;
2750 
2751  if (ST->hasSSE41())
2752  if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
2753  return LT.first * Entry->Cost;
2754 
2755  if (ST->hasSSE2())
2756  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
2757  return LT.first * Entry->Cost;
2758 
2759  if (ST->hasSSE1())
2760  if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
2761  return LT.first * Entry->Cost;
2762  }
2763 
2764  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2765 }
2766 
2767 /// Calculate the cost of materializing a 64-bit value. This helper
2768 /// method might only calculate a fraction of a larger immediate. Therefore it
2769 /// is valid to return a cost of ZERO.
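/// For example, 0 is free, any value that fits in a sign-extended 32-bit
/// immediate costs one basic unit, and a wider constant (which needs a
/// movabsq-style materialization) is charged two.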
2770 int X86TTIImpl::getIntImmCost(int64_t Val) {
2771  if (Val == 0)
2772  return TTI::TCC_Free;
2773 
2774  if (isInt<32>(Val))
2775  return TTI::TCC_Basic;
2776 
2777  return 2 * TTI::TCC_Basic;
2778 }
2779 
2780 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2781  assert(Ty->isIntegerTy());
2782 
2783  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2784  if (BitSize == 0)
2785  return ~0U;
2786 
2787  // Never hoist constants larger than 128 bits, because this might lead to
2788  // incorrect code generation or assertions in codegen.
2789  // FIXME: Create a cost model for types larger than i128 once the codegen
2790  // issues have been fixed.
2791  if (BitSize > 128)
2792  return TTI::TCC_Free;
2793 
2794  if (Imm == 0)
2795  return TTI::TCC_Free;
2796 
2797  // Sign-extend all constants to a multiple of 64-bit.
2798  APInt ImmVal = Imm;
2799  if (BitSize % 64 != 0)
2800  ImmVal = Imm.sext(alignTo(BitSize, 64));
2801 
2802  // Split the constant into 64-bit chunks and calculate the cost for each
2803  // chunk.
2804  int Cost = 0;
2805  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2806  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2807  int64_t Val = Tmp.getSExtValue();
2808  Cost += getIntImmCost(Val);
2809  }
2810  // We need at least one instruction to materialize the constant.
2811  return std::max(1, Cost);
2812 }
2813 
2814 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2815  Type *Ty) {
2816  assert(Ty->isIntegerTy());
2817 
2818  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2819  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2820  // here, so that constant hoisting will ignore this constant.
2821  if (BitSize == 0)
2822  return TTI::TCC_Free;
2823 
2824  unsigned ImmIdx = ~0U;
2825  switch (Opcode) {
2826  default:
2827  return TTI::TCC_Free;
2828  case Instruction::GetElementPtr:
2829  // Always hoist the base address of a GetElementPtr. This prevents the
2830  // creation of new constants for every base constant that gets constant
2831  // folded with the offset.
2832  if (Idx == 0)
2833  return 2 * TTI::TCC_Basic;
2834  return TTI::TCC_Free;
2835  case Instruction::Store:
2836  ImmIdx = 0;
2837  break;
2838  case Instruction::ICmp:
2839  // This is an imperfect hack to prevent constant hoisting of
2840  // compares that might be trying to check if a 64-bit value fits in
2841  // 32-bits. The backend can optimize these cases using a right shift by 32.
2842  // Ideally we would check the compare predicate here. There are also other
2843  // similar immediates the backend can use shifts for.
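 // For instance, (icmp ult i64 %x, 4294967296) keeps its immediate in place
 // below; the backend can turn it into a shift-by-32 check instead of
 // materializing the 64-bit constant.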
2844  if (Idx == 1 && Imm.getBitWidth() == 64) {
2845  uint64_t ImmVal = Imm.getZExtValue();
2846  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2847  return TTI::TCC_Free;
2848  }
2849  ImmIdx = 1;
2850  break;
2851  case Instruction::And:
2852  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2853  // by using a 32-bit operation with implicit zero extension. Detect such
2854  // immediates here as the normal path expects bit 31 to be sign extended.
2855  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2856  return TTI::TCC_Free;
2857  ImmIdx = 1;
2858  break;
2859  case Instruction::Add:
2860  case Instruction::Sub:
2861  // For add/sub, we can use the opposite instruction for INT32_MIN.
2862  if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
2863  return TTI::TCC_Free;
2864  ImmIdx = 1;
2865  break;
2866  case Instruction::UDiv:
2867  case Instruction::SDiv:
2868  case Instruction::URem:
2869  case Instruction::SRem:
2870  // Division by constant is typically expanded later into a different
2871  // instruction sequence. This completely changes the constants.
2872  // Report them as "free" to stop ConstantHoist from marking them as opaque.
2873  return TTI::TCC_Free;
2874  case Instruction::Mul:
2875  case Instruction::Or:
2876  case Instruction::Xor:
2877  ImmIdx = 1;
2878  break;
2879  // Always return TCC_Free for the shift value of a shift instruction.
2880  case Instruction::Shl:
2881  case Instruction::LShr:
2882  case Instruction::AShr:
2883  if (Idx == 1)
2884  return TTI::TCC_Free;
2885  break;
2886  case Instruction::Trunc:
2887  case Instruction::ZExt:
2888  case Instruction::SExt:
2889  case Instruction::IntToPtr:
2890  case Instruction::PtrToInt:
2891  case Instruction::BitCast:
2892  case Instruction::PHI:
2893  case Instruction::Call:
2894  case Instruction::Select:
2895  case Instruction::Ret:
2896  case Instruction::Load:
2897  break;
2898  }
2899 
2900  if (Idx == ImmIdx) {
2901  int NumConstants = divideCeil(BitSize, 64);
2902  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2903  return (Cost <= NumConstants * TTI::TCC_Basic)
2904  ? static_cast<int>(TTI::TCC_Free)
2905  : Cost;
2906  }
2907 
2908  return X86TTIImpl::getIntImmCost(Imm, Ty);
2909 }
2910 
2911 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2912  Type *Ty) {
2913  assert(Ty->isIntegerTy());
2914 
2915  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2916  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2917  // here, so that constant hoisting will ignore this constant.
2918  if (BitSize == 0)
2919  return TTI::TCC_Free;
2920 
2921  switch (IID) {
2922  default:
2923  return TTI::TCC_Free;
2924  case Intrinsic::sadd_with_overflow:
2925  case Intrinsic::uadd_with_overflow:
2926  case Intrinsic::ssub_with_overflow:
2927  case Intrinsic::usub_with_overflow:
2928  case Intrinsic::smul_with_overflow:
2929  case Intrinsic::umul_with_overflow:
2930  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2931  return TTI::TCC_Free;
2932  break;
2933  case Intrinsic::experimental_stackmap:
2934  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2935  return TTI::TCC_Free;
2936  break;
2937  case Intrinsic::experimental_patchpoint_void:
2938  case Intrinsic::experimental_patchpoint_i64:
2939  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2940  return TTI::TCC_Free;
2941  break;
2942  }
2943  return X86TTIImpl::getIntImmCost(Imm, Ty);
2944 }
2945 
2946 unsigned X86TTIImpl::getUserCost(const User *U,
2947  ArrayRef<const Value *> Operands) {
2948  if (isa<StoreInst>(U)) {
2949  Value *Ptr = U->getOperand(1);
2950  // Store instruction with index and scale costs 2 Uops.
2951  // Check the preceding GEP to identify non-const indices.
2952  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2953  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2954  return TTI::TCC_Basic * 2;
2955  }
2956  return TTI::TCC_Basic;
2957  }
2958  return BaseT::getUserCost(U, Operands);
2959 }
2960 
2961 // Return an average cost of a Gather / Scatter instruction; this may be improved later.
2962 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2963  unsigned Alignment, unsigned AddressSpace) {
2964 
2965  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2966  unsigned VF = SrcVTy->getVectorNumElements();
2967 
2968  // Try to reduce index size from 64 bit (default for GEP)
2969  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
2970  // operation will use 16 x 64-bit indices, which do not fit in a zmm and
2971  // need to be split. Also check that the base pointer is the same for all lanes,
2972  // and that there's at most one variable index.
2973  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2974  unsigned IndexSize = DL.getPointerSizeInBits();
2975  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2976  if (IndexSize < 64 || !GEP)
2977  return IndexSize;
2978 
2979  unsigned NumOfVarIndices = 0;
2980  Value *Ptrs = GEP->getPointerOperand();
2981  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2982  return IndexSize;
2983  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2984  if (isa<Constant>(GEP->getOperand(i)))
2985  continue;
2986  Type *IndxTy = GEP->getOperand(i)->getType();
2987  if (IndxTy->isVectorTy())
2988  IndxTy = IndxTy->getVectorElementType();
2989  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2990  !isa<SExtInst>(GEP->getOperand(i))) ||
2991  ++NumOfVarIndices > 1)
2992  return IndexSize; // 64
2993  }
2994  return (unsigned)32;
2995  };
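 // For instance (roughly): a VF=16 gather whose GEP has a uniform base and at
 // most one variable index that is either narrower than 64 bits or a
 // sign-extension can use 32-bit indices, keeping the index vector in one zmm.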
2996 
2997 
2998  // Try to reduce IndexSize to 32 bits for a vector of 16 elements.
2999  // By default the IndexSize is equal to the pointer size.
3000  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
3001  ? getIndexSizeInBits(Ptr, DL)
3002  : DL.getPointerSizeInBits();
3003 
3004  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
3005  IndexSize), VF);
3006  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
3007  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3008  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
3009  if (SplitFactor > 1) {
3010  // Handle splitting of vector of pointers
3011  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
3012  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
3013  AddressSpace);
3014  }
3015 
3016  // The gather / scatter cost is given by Intel architects. It is a rough
3017  // number since we are looking at one instruction at a time.
3018  const int GSOverhead = (Opcode == Instruction::Load)
3019  ? ST->getGatherOverhead()
3020  : ST->getScatterOverhead();
3021  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3022  Alignment, AddressSpace);
3023 }
3024 
3025 /// Return the cost of full scalarization of gather / scatter operation.
3026 ///
3027 /// Opcode - Load or Store instruction.
3028 /// SrcVTy - The type of the data vector that should be gathered or scattered.
3029 /// VariableMask - The mask is non-constant at compile time.
3030 /// Alignment - Alignment for one element.
3031 /// AddressSpace - pointer[s] address space.
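/// The returned cost is the sum of the per-lane scalar memory operations, the
/// insert/extract traffic to rebuild or decompose the vector, and (for a
/// variable mask) the per-lane mask extraction, compare and branch.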
3032 ///
3033 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
3034  bool VariableMask, unsigned Alignment,
3035  unsigned AddressSpace) {
3036  unsigned VF = SrcVTy->getVectorNumElements();
3037 
3038  int MaskUnpackCost = 0;
3039  if (VariableMask) {
3040  VectorType *MaskTy =
3041  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
3042  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
3043  int ScalarCompareCost =
3044  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
3045  nullptr);
3046  int BranchCost = getCFInstrCost(Instruction::Br);
3047  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
3048  }
3049 
3050  // The cost of the scalar loads/stores.
3051  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3052  Alignment, AddressSpace);
3053 
3054  int InsertExtractCost = 0;
3055  if (Opcode == Instruction::Load)
3056  for (unsigned i = 0; i < VF; ++i)
3057  // Add the cost of inserting each scalar load into the vector
3058  InsertExtractCost +=
3059  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
3060  else
3061  for (unsigned i = 0; i < VF; ++i)
3062  // Add the cost of extracting each element out of the data vector
3063  InsertExtractCost +=
3064  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
3065 
3066  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
3067 }
3068 
3069 /// Calculate the cost of Gather / Scatter operation
3070 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
3071  Value *Ptr, bool VariableMask,
3072  unsigned Alignment) {
3073  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
3074  unsigned VF = SrcVTy->getVectorNumElements();
3075  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
3076  if (!PtrTy && Ptr->getType()->isVectorTy())
3077  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
3078  assert(PtrTy && "Unexpected type for Ptr argument");
3079  unsigned AddressSpace = PtrTy->getAddressSpace();
3080 
3081  bool Scalarize = false;
3082  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
3083  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
3084  Scalarize = true;
3085  // Gather / Scatter for a 2-element vector is not profitable on KNL / SKX.
3086  // A 4-element gather/scatter instruction does not exist on KNL.
3087  // We could extend it to 8 elements, but zeroing the upper bits of
3088  // the mask vector will add more instructions. Right now we give the scalar
3089  // cost of a 4-element vector for KNL. TODO: Check whether the gather/scatter
3090  // instruction is better in the VariableMask case.
3091  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
3092  Scalarize = true;
3093 
3094  if (Scalarize)
3095  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
3096  AddressSpace);
3097 
3098  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
3099 }
3100 
3101 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
3102  TargetTransformInfo::LSRCost &C2) {
3103  // X86-specific: the number of instructions has first priority.
3104  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
3105  C1.NumIVMuls, C1.NumBaseAdds,
3106  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3107  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
3108  C2.NumIVMuls, C2.NumBaseAdds,
3109  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3110 }
3111 
3112 bool X86TTIImpl::canMacroFuseCmp() {
3113  return ST->hasMacroFusion() || ST->hasBranchFusion();
3114 }
3115 
3116 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
3117  if (!ST->hasAVX())
3118  return false;
3119 
3120  // The backend can't handle a single element vector.
3121  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
3122  return false;
3123  Type *ScalarTy = DataTy->getScalarType();
3124 
3125  if (ScalarTy->isPointerTy())
3126  return true;
3127 
3128  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3129  return true;
3130 
3131  if (!ScalarTy->isIntegerTy())
3132  return false;
3133 
3134  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3135  return IntWidth == 32 || IntWidth == 64 ||
3136  ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
3137 }
3138 
3139 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
3140  return isLegalMaskedLoad(DataType);
3141 }
3142 
3143 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
3144  if (!isa<VectorType>(DataTy))
3145  return false;
3146 
3147  if (!ST->hasAVX512())
3148  return false;
3149 
3150  // The backend can't handle a single element vector.
3151  if (DataTy->getVectorNumElements() == 1)
3152  return false;
3153 
3154  Type *ScalarTy = DataTy->getVectorElementType();
3155 
3156  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3157  return true;
3158 
3159  if (!ScalarTy->isIntegerTy())
3160  return false;
3161 
3162  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3163  return IntWidth == 32 || IntWidth == 64 ||
3164  ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
3165 }
3166 
3167 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
3168  return isLegalMaskedExpandLoad(DataTy);
3169 }
3170 
3171 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
3172  // Some CPUs have better gather performance than others.
3173  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
3174  // enable gather with a -march.
3175  if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
3176  return false;
3177 
3178  // This function is called now in two cases: from the Loop Vectorizer
3179  // and from the Scalarizer.
3180  // When the Loop Vectorizer asks about legality of the feature,
3181  // the vectorization factor is not calculated yet. The Loop Vectorizer
3182  // sends a scalar type and the decision is based on the width of the
3183  // scalar element.
3184  // Later on, the cost model will estimate usage of this intrinsic based on
3185  // the vector type.
3186  // The Scalarizer asks again about legality. It sends a vector type.
3187  // In this case we can reject non-power-of-2 vectors.
3188  // We also reject single element vectors as the type legalizer can't
3189  // scalarize it.
3190  if (isa<VectorType>(DataTy)) {
3191  unsigned NumElts = DataTy->getVectorNumElements();
3192  if (NumElts == 1 || !isPowerOf2_32(NumElts))
3193  return false;
3194  }
3195  Type *ScalarTy = DataTy->getScalarType();
3196  if (ScalarTy->isPointerTy())
3197  return true;
3198 
3199  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3200  return true;
3201 
3202  if (!ScalarTy->isIntegerTy())
3203  return false;
3204 
3205  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
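 // The hardware gather instructions only handle 32- and 64-bit elements
 // (e.g. vpgatherdd / vpgatherqq).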
3206  return IntWidth == 32 || IntWidth == 64;
3207 }
3208 
3209 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
3210  // AVX2 doesn't support scatter.
3211  if (!ST->hasAVX512())
3212  return false;
3213  return isLegalMaskedGather(DataType);
3214 }
3215 
3216 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
3217  EVT VT = TLI->getValueType(DL, DataType);
3218  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
3219 }
3220 
3221 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
3222  return false;
3223 }
3224 
3225 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
3226  const Function *Callee) const {
3227  const TargetMachine &TM = getTLI()->getTargetMachine();
3228 
3229  // Model this as a subset check on the subtarget feature bits.
3230  const FeatureBitset &CallerBits =
3231  TM.getSubtargetImpl(*Caller)->getFeatureBits();
3232  const FeatureBitset &CalleeBits =
3233  TM.getSubtargetImpl(*Callee)->getFeatureBits();
3234 
3235  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
3236  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
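 // Inlining is allowed only when the callee's (non-ignored) features are a
 // subset of the caller's; e.g. an AVX2 caller may inline an SSE2 callee,
 // but not the other way around.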
3237  return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
3238 }
3239 
3240 bool X86TTIImpl::areFunctionArgsABICompatible(
3241  const Function *Caller, const Function *Callee,
3242  SmallPtrSetImpl<Argument *> &Args) const {
3243  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
3244  return false;
3245 
3246  // If we get here, we know the target features match. If one function
3247  // considers 512-bit vectors legal and the other does not, consider them
3248  // incompatible.
3249  // FIXME: Look at the arguments and only consider 512-bit or larger vectors?
3250  const TargetMachine &TM = getTLI()->getTargetMachine();
3251 
3252  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
3253  TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
3254 }
3255 
3256 const X86TTIImpl::TTI::MemCmpExpansionOptions *
3257 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
3258  // Only enable vector loads for equality comparison.
3259  // Right now the vector version is not as fast, see #33329.
3260  static const auto ThreeWayOptions = [this]() {
3261  TTI::MemCmpExpansionOptions Options;
3262  if (ST->is64Bit()) {
3263  Options.LoadSizes.push_back(8);
3264  }
3265  Options.LoadSizes.push_back(4);
3266  Options.LoadSizes.push_back(2);
3267  Options.LoadSizes.push_back(1);
3268  return Options;
3269  }();
3270  static const auto EqZeroOptions = [this]() {
3271  TTI::MemCmpExpansionOptions Options;
3272  // TODO: enable AVX512 when the DAG is ready.
3273  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
3274  if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
3275  if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
3276  if (ST->is64Bit()) {
3277  Options.LoadSizes.push_back(8);
3278  }
3279  Options.LoadSizes.push_back(4);
3280  Options.LoadSizes.push_back(2);
3281  Options.LoadSizes.push_back(1);
3282  // All GPR and vector loads can be unaligned. SIMD compare requires integer
3283  // vectors (SSE2/AVX2).
3284  Options.AllowOverlappingLoads = true;
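 // With overlapping loads, e.g. a 15-byte equality compare on x86-64 can be
 // expanded into two 8-byte loads per operand (at offsets 0 and 7) instead
 // of an 8 + 4 + 2 + 1 byte sequence.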
3285  return Options;
3286  }();
3287  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
3288 }
3289 
3290 bool X86TTIImpl::enableInterleavedAccessVectorization() {
3291  // TODO: We expect this to be beneficial regardless of arch,
3292  // but there are currently some unexplained performance artifacts on Atom.
3293  // As a temporary solution, disable on Atom.
3294  return !(ST->isAtom());
3295 }
3296 
3297 // Get estimation for interleaved load/store operations for AVX2.
3298 // \p Factor is the interleaved-access factor (stride) - number of
3299 // (interleaved) elements in the group.
3300 // \p Indices contains the indices for a strided load: when the
3301 // interleaved load has gaps they indicate which elements are used.
3302 // If Indices is empty (or if the number of indices is equal to the size
3303 // of the interleaved-access as given in \p Factor) the access has no gaps.
3304 //
3305 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
3306 // computing the cost using a generic formula as a function of generic
3307 // shuffles. We therefore use a lookup table instead, filled according to
3308 // the instruction sequences that codegen currently generates.
3309 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
3310  unsigned Factor,
3311  ArrayRef<unsigned> Indices,
3312  unsigned Alignment,
3313  unsigned AddressSpace,
3314  bool UseMaskForCond,
3315  bool UseMaskForGaps) {
3316 
3317  if (UseMaskForCond || UseMaskForGaps)
3318  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3319  Alignment, AddressSpace,
3320  UseMaskForCond, UseMaskForGaps);
3321 
3322  // We currently support only fully-interleaved groups, with no gaps.
3323  // TODO: Support also strided loads (interleaved-groups with gaps).
3324  if (Indices.size() && Indices.size() != Factor)
3325  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3326  Alignment, AddressSpace);
3327 
3328  // VecTy for interleave memop is <VF*Factor x Elt>.
3329  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3330  // VecTy = <12 x i32>.
3331  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3332 
3333  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
3334  // the VF=2, while v2i128 is an unsupported MVT vector type
3335  // (see MachineValueType.h::getVectorVT()).
3336  if (!LegalVT.isVector())
3337  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3338  Alignment, AddressSpace);
3339 
3340  unsigned VF = VecTy->getVectorNumElements() / Factor;
3341  Type *ScalarTy = VecTy->getVectorElementType();
3342 
3343  // Calculate the number of memory operations (NumOfMemOps), required
3344  // for load/store the VecTy.
3345  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3346  unsigned LegalVTSize = LegalVT.getStoreSize();
3347  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
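 // E.g. VecTy = <12 x i32> (48 bytes) legalizes to v8i32 (32 bytes) on AVX2,
 // so NumOfMemOps = (48 + 32 - 1) / 32 = 2.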
3348 
3349  // Get the cost of one memory operation.
3350  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3351  LegalVT.getVectorNumElements());
3352  unsigned MemOpCost =
3353  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3354 
3355  VectorType *VT = VectorType::get(ScalarTy, VF);
3356  EVT ETy = TLI->getValueType(DL, VT);
3357  if (!ETy.isSimple())
3358  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3359  Alignment, AddressSpace);
3360 
3361  // TODO: Complete for other data-types and strides.
3362  // Each combination of Stride, ElementTy and VF results in a different
3363  // sequence; the cost tables are therefore accessed with:
3364  // Factor (stride) and VectorType=VFxElemType.
3365  // The cost accounts only for the shuffle sequence;
3366  // the cost of the loads/stores is accounted for separately.
3367  //
3368  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
3369  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
3370  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
3371 
3372  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
3373  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
3374  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
3375  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
3376  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
3377  { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
3378 
3379  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
3380  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
3381  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
3382  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
3383  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
3384 
3385  { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
3386  };
3387 
3388  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
3389  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
3390  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
3391 
3392  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
3393  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
3394  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
3395  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
3396  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
3397 
3398  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
3399  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
3400  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
3401  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
3402  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
3403  };
3404 
3405  if (Opcode == Instruction::Load) {
3406  if (const auto *Entry =
3407  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
3408  return NumOfMemOps * MemOpCost + Entry->Cost;
3409  } else {
3410  assert(Opcode == Instruction::Store &&
3411  "Expected Store Instruction at this point");
3412  if (const auto *Entry =
3413  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
3414  return NumOfMemOps * MemOpCost + Entry->Cost;
3415  }
3416 
3417  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3418  Alignment, AddressSpace);
3419 }
3420 
3421 // Get estimation for interleaved load/store operations and strided load.
3422 // \p Indices contains indices for strided load.
3423 // \p Factor - the factor of interleaving.
3424 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
3425 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
3426  unsigned Factor,
3427  ArrayRef<unsigned> Indices,
3428  unsigned Alignment,
3429  unsigned AddressSpace,
3430  bool UseMaskForCond,
3431  bool UseMaskForGaps) {
3432 
3433  if (UseMaskForCond || UseMaskForGaps)
3434  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3435  Alignment, AddressSpace,
3436  UseMaskForCond, UseMaskForGaps);
3437 
3438  // VecTy for interleave memop is <VF*Factor x Elt>.
3439  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3440  // VecTy = <12 x i32>.
3441 
3442  // Calculate the number of memory operations (NumOfMemOps), required
3443  // for load/store the VecTy.
3444  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3445  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3446  unsigned LegalVTSize = LegalVT.getStoreSize();
3447  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3448 
3449  // Get the cost of one memory operation.
3450  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3451  LegalVT.getVectorNumElements());
3452  unsigned MemOpCost =
3453  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3454 
3455  unsigned VF = VecTy->getVectorNumElements() / Factor;
3456  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
3457 
3458  if (Opcode == Instruction::Load) {
3459  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
3460  // contain the cost of the optimized shuffle sequence that the
3461  // X86InterleavedAccess pass will generate.
3462  // The cost of the loads and stores is computed separately from the table.
3463 
3464  // X86InterleavedAccess supports only the following interleaved-access groups.
3465  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
3466  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
3467  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
3468  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
3469  };
3470 
3471  if (const auto *Entry =
3472  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
3473  return NumOfMemOps * MemOpCost + Entry->Cost;
3474  // If an entry does not exist, fall back to the default implementation.
3475 
3476  // Kind of shuffle depends on number of loaded values.
3477  // If we load the entire data in one register, we can use a 1-src shuffle.
3478  // Otherwise, we'll merge 2 sources in each operation.
3479  TTI::ShuffleKind ShuffleKind =
3480  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
3481 
3482  unsigned ShuffleCost =
3483  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
3484 
3485  unsigned NumOfLoadsInInterleaveGrp =
3486  Indices.size() ? Indices.size() : Factor;
3487  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
3488  VecTy->getVectorNumElements() / Factor);
3489  unsigned NumOfResults =
3490  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
3491  NumOfLoadsInInterleaveGrp;
3492 
3493  // About half of the loads may be folded into shuffles when we have only
3494  // one result. If we have more than one result, we do not fold loads at all.
3495  unsigned NumOfUnfoldedLoads =
3496  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
3497 
3498  // Get a number of shuffle operations per result.
3499  unsigned NumOfShufflesPerResult =
3500  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
3501 
3502  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3503  // When we have more than one destination, we need additional instructions
3504  // to keep the sources.
3505  unsigned NumOfMoves = 0;
3506  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
3507  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
3508 
3509  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
3510  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
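 // Rough example, assuming unit load and shuffle costs: a stride-2 load of
 // <16 x i32> legalizes to a single v16i32 memop, so NumOfResults = 2,
 // NumOfShufflesPerResult = 1 and NumOfMoves = 0, giving 2*1*1 + 1*1 + 0 = 3.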
3511 
3512  return Cost;
3513  }
3514 
3515  // Store.
3516  assert(Opcode == Instruction::Store &&
3517  "Expected Store Instruction at this point");
3518  // X86InterleavedAccess supports only the following interleaved-access groups.
3519  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
3520  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
3521  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
3522  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
3523 
3524  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
3525  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
3526  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
3527  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
3528  };
3529 
3530  if (const auto *Entry =
3531  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
3532  return NumOfMemOps * MemOpCost + Entry->Cost;
3533  // If an entry does not exist, fall back to the default implementation.
3534 
3535  // There are no strided stores at the moment, and a store can't be folded
3536  // into a shuffle.
3537  unsigned NumOfSources = Factor; // The number of values to be merged.
3538  unsigned ShuffleCost =
3539  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
3540  unsigned NumOfShufflesPerStore = NumOfSources - 1;
3541 
3542  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3543  // We need additional instructions to keep the sources.
3544  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
3545  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
3546  NumOfMoves;
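 // Rough example, assuming unit store and shuffle costs: a stride-2 store of
 // <32 x i32> legalizes to two v16i32 memops, so NumOfShufflesPerStore = 1,
 // NumOfMoves = 1, and the cost is 2 * (1 + 1*1) + 1 = 5.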
3547  return Cost;
3548 }
3549 
3550 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
3551  unsigned Factor,
3552  ArrayRef<unsigned> Indices,
3553  unsigned Alignment,
3554  unsigned AddressSpace,
3555  bool UseMaskForCond,
3556  bool UseMaskForGaps) {
3557  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
3558  Type *EltTy = VecTy->getVectorElementType();
3559  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
3560  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
3561  return true;
3562  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
3563  return HasBW;
3564  return false;
3565  };
3566  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
3567  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
3568  Alignment, AddressSpace,
3569  UseMaskForCond, UseMaskForGaps);
3570  if (ST->hasAVX2())
3571  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
3572  Alignment, AddressSpace,
3573  UseMaskForCond, UseMaskForGaps);
3574 
3575  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3576  Alignment, AddressSpace,
3577  UseMaskForCond, UseMaskForGaps);
3578 }