LLVM 19.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About Cost Model numbers used below it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU instead of usage of a
17/// specific CPU model. Usually the numbers correspond to the CPU where the
18/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost,
21/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target dependent costs (latency)
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
46/// the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48/// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
50
58#include "llvm/Support/Debug.h"
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71// Helper struct to store/access costs for each cost kind.
72// TODO: Move this to allow other targets to use it?
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
79 std::optional<unsigned>
81 unsigned Cost = ~0U;
82 switch (Kind) {
85 break;
88 break;
91 break;
94 break;
95 }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
103
106 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
107 // TODO: Currently the __builtin_popcount() implementation using SSE3
108 // instructions is inefficient. Once the problem is fixed, we should
109 // call ST->hasSSE3() instead of ST->hasPOPCNT().
110 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
111}
112
113std::optional<unsigned> X86TTIImpl::getCacheSize(
115 switch (Level) {
117 // - Penryn
118 // - Nehalem
119 // - Westmere
120 // - Sandy Bridge
121 // - Ivy Bridge
122 // - Haswell
123 // - Broadwell
124 // - Skylake
125 // - Kabylake
126 return 32 * 1024; // 32 KByte
128 // - Penryn
129 // - Nehalem
130 // - Westmere
131 // - Sandy Bridge
132 // - Ivy Bridge
133 // - Haswell
134 // - Broadwell
135 // - Skylake
136 // - Kabylake
137 return 256 * 1024; // 256 KByte
138 }
139
140 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
141}
142
143std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
145 // - Penryn
146 // - Nehalem
147 // - Westmere
148 // - Sandy Bridge
149 // - Ivy Bridge
150 // - Haswell
151 // - Broadwell
152 // - Skylake
153 // - Kabylake
154 switch (Level) {
156 [[fallthrough]];
158 return 8;
159 }
160
161 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
162}
163
164unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
165 bool Vector = (ClassID == 1);
166 if (Vector && !ST->hasSSE1())
167 return 0;
168
169 if (ST->is64Bit()) {
170 if (Vector && ST->hasAVX512())
171 return 32;
172 return 16;
173 }
174 return 8;
175}
176
179 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
180 switch (K) {
182 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
184 if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
185 return TypeSize::getFixed(512);
186 if (ST->hasAVX() && PreferVectorWidth >= 256)
187 return TypeSize::getFixed(256);
188 if (ST->hasSSE1() && PreferVectorWidth >= 128)
189 return TypeSize::getFixed(128);
190 return TypeSize::getFixed(0);
192 return TypeSize::getScalable(0);
193 }
194
195 llvm_unreachable("Unsupported register kind");
196}
197
200 .getFixedValue();
201}
202
204 // If the loop will not be vectorized, don't interleave the loop.
205 // Let regular unroll to unroll the loop, which saves the overflow
206 // check and memory check cost.
207 if (VF.isScalar())
208 return 1;
209
210 if (ST->isAtom())
211 return 1;
212
213 // Sandybridge and Haswell have multiple execution ports and pipelined
214 // vector units.
215 if (ST->hasAVX())
216 return 4;
217
218 return 2;
219}
220
222 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
225 const Instruction *CxtI) {
226
227 // vXi8 multiplications are always promoted to vXi16.
228 // Sub-128-bit types can be extended/packed more efficiently.
229 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
230 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
231 Type *WideVecTy =
232 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
233 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
235 CostKind) +
236 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
238 CostKind) +
239 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
240 }
241
242 // Legalize the type.
243 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
244
245 int ISD = TLI->InstructionOpcodeToISD(Opcode);
246 assert(ISD && "Invalid opcode");
247
248 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
249 (LT.second.getScalarType() == MVT::i32 ||
250 LT.second.getScalarType() == MVT::i64)) {
251 // Check if the operands can be represented as a smaller datatype.
252 bool Op1Signed = false, Op2Signed = false;
253 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
254 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
255 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
256 bool SignedMode = Op1Signed || Op2Signed;
257
258 // If both vXi32 are representable as i15 and at least one is constant,
259 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
260 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
261 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
262 LT.second.getScalarType() == MVT::i32) {
263 bool Op1Constant =
264 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
265 bool Op2Constant =
266 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
267 bool Op1Sext = isa<SExtInst>(Args[0]) &&
268 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
269 bool Op2Sext = isa<SExtInst>(Args[1]) &&
270 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
271
272 bool IsZeroExtended = !Op1Signed || !Op2Signed;
273 bool IsConstant = Op1Constant || Op2Constant;
274 bool IsSext = Op1Sext || Op2Sext;
275 if (IsConstant || IsZeroExtended || IsSext)
276 LT.second =
277 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
278 }
279
280 // Check if the vXi32 operands can be shrunk into a smaller datatype.
281 // This should match the codegen from reduceVMULWidth.
282 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
283 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
284 if (OpMinSize <= 7)
285 return LT.first * 3; // pmullw/sext
286 if (!SignedMode && OpMinSize <= 8)
287 return LT.first * 3; // pmullw/zext
288 if (OpMinSize <= 15)
289 return LT.first * 5; // pmullw/pmulhw/pshuf
290 if (!SignedMode && OpMinSize <= 16)
291 return LT.first * 5; // pmullw/pmulhw/pshuf
292 }
293
294 // If both vXi64 are representable as (unsigned) i32, then we can perform
295 // the multiply with a single PMULUDQ instruction.
296 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
297 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
298 ISD = X86ISD::PMULUDQ;
299 }
300
301 // Vector multiply by pow2 will be simplified to shifts.
302 // Vector multiply by -pow2 will be simplified to shifts/negates.
303 if (ISD == ISD::MUL && Op2Info.isConstant() &&
304 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
306 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
307 Op1Info.getNoProps(), Op2Info.getNoProps());
308 if (Op2Info.isNegatedPowerOf2())
309 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
310 return Cost;
311 }
312
313 // On X86, vector signed division by power-of-two constants is
314 // normally expanded to the sequence SRA + SRL + ADD + SRA.
315 // The OperandValue properties may not be the same as that of the previous
316 // operation; conservatively assume OP_None.
317 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
318 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
320 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
321 Op1Info.getNoProps(), Op2Info.getNoProps());
322 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
323 Op1Info.getNoProps(), Op2Info.getNoProps());
324 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
325 Op1Info.getNoProps(), Op2Info.getNoProps());
326
327 if (ISD == ISD::SREM) {
328 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
329 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
330 Op2Info.getNoProps());
331 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
332 Op2Info.getNoProps());
333 }
334
335 return Cost;
336 }
337
338 // Vector unsigned division/remainder will be simplified to shifts/masks.
339 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
340 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
341 if (ISD == ISD::UDIV)
342 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
343 Op1Info.getNoProps(), Op2Info.getNoProps());
344 // UREM
345 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
346 Op1Info.getNoProps(), Op2Info.getNoProps());
347 }
348
349 static const CostKindTblEntry GFNIUniformConstCostTable[] = {
350 { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
351 { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
352 { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
353 { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
354 { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
355 { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
356 { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
357 { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
358 { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
359 };
360
361 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
362 if (const auto *Entry =
363 CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
364 if (auto KindCost = Entry->Cost[CostKind])
365 return LT.first * *KindCost;
366
367 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
368 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
369 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
370 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
371 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
372 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
373 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
374 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
375 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
376 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
377
378 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
379 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
380 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
381 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
382 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
383 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
384 };
385
386 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
387 if (const auto *Entry =
388 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
389 if (auto KindCost = Entry->Cost[CostKind])
390 return LT.first * *KindCost;
391
392 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
393 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
394 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
395 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
396
397 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
398 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
399 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
400
401 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
402 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
403 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
404 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
405 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
406 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
407
408 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
409 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
410 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
411 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
412 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
413 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
414 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
415
416 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
417 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
418 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
419 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
420 };
421
422 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
423 if (const auto *Entry =
424 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
425 if (auto KindCost = Entry->Cost[CostKind])
426 return LT.first * *KindCost;
427
428 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
429 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
430 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
431 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
432 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
433 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
434 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
435
436 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
437 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
438 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
439 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
440 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
441 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
442
443 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
444 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
445 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
446 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
447 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
448 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
449
450 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
451 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
452 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
453 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
454 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
455 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
456
457 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
458 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
459 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
460 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
461 };
462
463 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
464 if (const auto *Entry =
465 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
466 if (auto KindCost = Entry->Cost[CostKind])
467 return LT.first * *KindCost;
468
469 static const CostKindTblEntry AVXUniformConstCostTable[] = {
470 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
471 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
472 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
473 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
474 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
475 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
476
477 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
478 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
479 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
480 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
481 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
482 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
483
484 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
485 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
486 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
487 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
488 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
489 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
490
491 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
492 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
493 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
494 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
495 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
496 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
497
498 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
499 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
500 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
501 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
502 };
503
504 // XOP has faster vXi8 shifts.
505 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
506 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
507 if (const auto *Entry =
508 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
509 if (auto KindCost = Entry->Cost[CostKind])
510 return LT.first * *KindCost;
511
512 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
513 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
514 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
515 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
516
517 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
518 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
519 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
520
521 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
522 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
523 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
524
525 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
526 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
527 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
528
529 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
530 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
531 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
532 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
533 };
534
535 // XOP has faster vXi8 shifts.
536 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
537 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
538 if (const auto *Entry =
539 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
540 if (auto KindCost = Entry->Cost[CostKind])
541 return LT.first * *KindCost;
542
543 static const CostKindTblEntry AVX512BWConstCostTable[] = {
544 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
545 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
546 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
547 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
548
549 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
550 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
551 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
552 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
553 };
554
555 if (Op2Info.isConstant() && ST->hasBWI())
556 if (const auto *Entry =
557 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
558 if (auto KindCost = Entry->Cost[CostKind])
559 return LT.first * *KindCost;
560
561 static const CostKindTblEntry AVX512ConstCostTable[] = {
562 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
563 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
564 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
565 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
566
567 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
568 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
569 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
570 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
571
572 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
573 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
574 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
575 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
576 };
577
578 if (Op2Info.isConstant() && ST->hasAVX512())
579 if (const auto *Entry =
580 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
581 if (auto KindCost = Entry->Cost[CostKind])
582 return LT.first * *KindCost;
583
584 static const CostKindTblEntry AVX2ConstCostTable[] = {
585 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
586 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
587 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
588 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
589
590 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
591 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
592 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
593 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
594
595 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
596 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
597 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
598 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
599 };
600
601 if (Op2Info.isConstant() && ST->hasAVX2())
602 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
603 if (auto KindCost = Entry->Cost[CostKind])
604 return LT.first * *KindCost;
605
606 static const CostKindTblEntry AVXConstCostTable[] = {
607 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
608 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
609 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
610 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
611
612 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
613 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
614 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
615 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
616
617 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
618 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
619 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
620 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
621 };
622
623 if (Op2Info.isConstant() && ST->hasAVX())
624 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
625 if (auto KindCost = Entry->Cost[CostKind])
626 return LT.first * *KindCost;
627
628 static const CostKindTblEntry SSE41ConstCostTable[] = {
629 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
630 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
631 };
632
633 if (Op2Info.isConstant() && ST->hasSSE41())
634 if (const auto *Entry =
635 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
636 if (auto KindCost = Entry->Cost[CostKind])
637 return LT.first * *KindCost;
638
639 static const CostKindTblEntry SSE2ConstCostTable[] = {
640 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
641 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
642 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
643 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
644
645 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
646 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
647 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
648 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
649
650 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
651 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
652 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
653 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
654 };
655
656 if (Op2Info.isConstant() && ST->hasSSE2())
657 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
658 if (auto KindCost = Entry->Cost[CostKind])
659 return LT.first * *KindCost;
660
661 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
662 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
663 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
664 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
665 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
666 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
667 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
668 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
669 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
670 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
671
672 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
673 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
674 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
675 };
676
677 if (ST->hasBWI() && Op2Info.isUniform())
678 if (const auto *Entry =
679 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
680 if (auto KindCost = Entry->Cost[CostKind])
681 return LT.first * *KindCost;
682
683 static const CostKindTblEntry AVX512UniformCostTable[] = {
684 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
685 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
686 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
687
688 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
689 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
690 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
691
692 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
693 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
694 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
695 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
696 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
697 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
698 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
699 };
700
701 if (ST->hasAVX512() && Op2Info.isUniform())
702 if (const auto *Entry =
703 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
704 if (auto KindCost = Entry->Cost[CostKind])
705 return LT.first * *KindCost;
706
707 static const CostKindTblEntry AVX2UniformCostTable[] = {
708 // Uniform splats are cheaper for the following instructions.
709 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
710 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
711 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
712 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
713 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
714 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
715
716 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
717 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
718 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
719 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
720 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
721 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
722
723 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
724 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
725 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
726 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
727 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
728 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
729
730 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
731 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
732 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
733 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
734 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
735 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
736 };
737
738 if (ST->hasAVX2() && Op2Info.isUniform())
739 if (const auto *Entry =
740 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
741 if (auto KindCost = Entry->Cost[CostKind])
742 return LT.first * *KindCost;
743
744 static const CostKindTblEntry AVXUniformCostTable[] = {
745 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
746 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
747 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
748 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
749 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
750 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
751
752 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
753 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
754 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
755 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
756 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
757 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
758
759 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
760 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
761 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
762 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
763 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
764 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
765
766 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
767 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
768 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
769 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
770 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
771 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
772 };
773
774 // XOP has faster vXi8 shifts.
775 if (ST->hasAVX() && Op2Info.isUniform() &&
776 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
777 if (const auto *Entry =
778 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
779 if (auto KindCost = Entry->Cost[CostKind])
780 return LT.first * *KindCost;
781
782 static const CostKindTblEntry SSE2UniformCostTable[] = {
783 // Uniform splats are cheaper for the following instructions.
784 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
785 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
786 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
787
788 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
789 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
790 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
791
792 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
793 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
794 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
795
796 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
797 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
798 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
799 };
800
801 if (ST->hasSSE2() && Op2Info.isUniform() &&
802 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
803 if (const auto *Entry =
804 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
805 if (auto KindCost = Entry->Cost[CostKind])
806 return LT.first * *KindCost;
807
808 static const CostKindTblEntry AVX512DQCostTable[] = {
809 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
810 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
811 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
812 };
813
814 // Look for AVX512DQ lowering tricks for custom cases.
815 if (ST->hasDQI())
816 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
817 if (auto KindCost = Entry->Cost[CostKind])
818 return LT.first * *KindCost;
819
820 static const CostKindTblEntry AVX512BWCostTable[] = {
821 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
822 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
823 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
824 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
825 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
826 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
827 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
828 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
829 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
830
831 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
832 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
833 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
834 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
835 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
836 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
837 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
838 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
839 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
840
841 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
842 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
843
844 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
845 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
846 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
847 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
848
849 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
850 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
851
852 { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
853 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
854
855 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
856 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
857 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
858 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
859 };
860
861 // Look for AVX512BW lowering tricks for custom cases.
862 if (ST->hasBWI())
863 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
864 if (auto KindCost = Entry->Cost[CostKind])
865 return LT.first * *KindCost;
866
867 static const CostKindTblEntry AVX512CostTable[] = {
868 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
869 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
870 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
871
872 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
873 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
874 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
875
876 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
877 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
878 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
879 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
880 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
881 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
882 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
883 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
884 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
885
886 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
887 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
888 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
889 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
890 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
891 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
892 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
893 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
894 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
895
896 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
897 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
898
899 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
900 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
901
902 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
903 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
904 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
905 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
906
907 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
908 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
909 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
910 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
911
912 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
913 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
914 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
915 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
916
917 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
918 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
919 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
920 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
921 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
922
923 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
924
925 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
926 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
927 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
928 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
929 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
930 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
931 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
932 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
933 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
934
935 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
936 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
937 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
938 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
939
940 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
941 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
942 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
943 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
944 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
945 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
946 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
947 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
948 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
949
950 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
951 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
952 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
953 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
954 };
955
956 if (ST->hasAVX512())
957 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
958 if (auto KindCost = Entry->Cost[CostKind])
959 return LT.first * *KindCost;
960
961 static const CostKindTblEntry AVX2ShiftCostTable[] = {
962 // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them
963 // custom in order to detect the cases where the shift amount is a scalar.
964 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
965 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
966 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
967 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
968 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
969 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
970 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
971 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
972 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
973 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
974 };
975
976 if (ST->hasAVX512()) {
977 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
978 // On AVX512, a packed v32i16 shift left by a constant build_vector
979 // is lowered into a vector multiply (vpmullw).
980 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
981 Op1Info.getNoProps(), Op2Info.getNoProps());
982 }
983
984 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
985 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
986 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
987 Op2Info.isConstant())
988 // On AVX2, a packed v16i16 shift left by a constant build_vector
989 // is lowered into a vector multiply (vpmullw).
990 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
991 Op1Info.getNoProps(), Op2Info.getNoProps());
992
993 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
994 if (auto KindCost = Entry->Cost[CostKind])
995 return LT.first * *KindCost;
996 }
997
998 static const CostKindTblEntry XOPShiftCostTable[] = {
999 // 128bit shifts take 1cy, but right shifts require negation beforehand.
1000 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
1001 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
1002 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
1003 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
1004 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
1005 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
1006 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
1007 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
1008 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
1009 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
1010 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
1011 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
1012 // 256bit shifts require splitting if AVX2 didn't catch them above.
1013 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
1014 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
1015 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
1016 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
1017 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
1018 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1019 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1020 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1021 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1022 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1023 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1024 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1025 };
1026
1027 // Look for XOP lowering tricks.
1028 if (ST->hasXOP()) {
1029 // If the right shift is constant then we'll fold the negation so
1030 // it's as cheap as a left shift.
1031 int ShiftISD = ISD;
1032 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1033 ShiftISD = ISD::SHL;
1034 if (const auto *Entry =
1035 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1036 if (auto KindCost = Entry->Cost[CostKind])
1037 return LT.first * *KindCost;
1038 }
1039
1040 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1041 MVT VT = LT.second;
1042 // A vector shift left by a non-uniform constant can be lowered
1043 // into a vector multiply.
1044 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1045 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1046 ISD = ISD::MUL;
1047 }
1048
1049 static const CostKindTblEntry GLMCostTable[] = {
1050 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1051 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1052 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1053 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1054 };
1055
1056 if (ST->useGLMDivSqrtCosts())
1057 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1058 if (auto KindCost = Entry->Cost[CostKind])
1059 return LT.first * *KindCost;
1060
1061 static const CostKindTblEntry SLMCostTable[] = {
1062 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1063 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1064 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1065 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1066 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1067 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1068 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1069 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1070 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1071 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1072 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1073 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1074 // v2i64/v4i64 mul is custom lowered as a series of long operations:
1075 // multiplies(3), shifts(3) and adds(2).
1076 // SLM muldq throughput is 2 and addq throughput is 4,
1077 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1078 // 2X4 (addq throughput) = 17
1079 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1080 // slm addq\subq throughput is 4
1081 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1082 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1083 };
1084
1085 if (ST->useSLMArithCosts())
1086 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1087 if (auto KindCost = Entry->Cost[CostKind])
1088 return LT.first * *KindCost;
1089
1090 static const CostKindTblEntry AVX2CostTable[] = {
1091 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1092 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1093 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
1094 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1095
1096 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1097 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1098 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1099 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1100
1101 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1102 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1103 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1104 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1105 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1106 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1107
1108 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1109 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1110 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1111 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1112 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1113 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1114 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1115 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1116
1117 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1118 { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
1119 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1120 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1121 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1122 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1123 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1124
1125 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1126
1127 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1128 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1129
1130 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1131 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1132 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1133 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1134 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1135 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1136
1137 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1138 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1139 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1140 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1141 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1142 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1143
1144 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1145 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1146 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1147 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1148 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1149 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1150
1151 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1152 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1153 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1154 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1155 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1156 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1157 };
1158
1159 // Look for AVX2 lowering tricks for custom cases.
1160 if (ST->hasAVX2())
1161 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1162 if (auto KindCost = Entry->Cost[CostKind])
1163 return LT.first * *KindCost;
1164
1165 static const CostKindTblEntry AVX1CostTable[] = {
1166 // We don't have to scalarize unsupported ops. We can issue two half-sized
1167 // operations and we only need to extract the upper YMM half.
1168 // Two ops + 1 extract + 1 insert = 4.
1169 { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
1170 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1171 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1172 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1173 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1174
1175 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1176 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1177 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1178 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1179
1180 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1181 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1182 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1183 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1184
1185 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1186 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1187 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1188 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1189
1190 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1191 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1192 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1193 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1194 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1195 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1196 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1197 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1198 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1199 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1200
1201 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1202 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1203 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1204 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1205 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1206 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1207 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1208 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1209
1210 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1211 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1212 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1213 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1214 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1215 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1216 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1217 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1218
1219 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1220 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1221 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1222 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1223 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1224 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1225 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1226 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1227
1228 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1229 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1230
1231 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1232 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1233 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1234 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1235 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1236 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1237
1238 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1239 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1240 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1241 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1242 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1243 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1244
1245 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1246 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1247 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1248 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1249 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1250 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1251
1252 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1253 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1254 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1255 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1256 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1257 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1258 };
1259
1260 if (ST->hasAVX())
1261 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1262 if (auto KindCost = Entry->Cost[CostKind])
1263 return LT.first * *KindCost;
1264
1265 static const CostKindTblEntry SSE42CostTable[] = {
1266 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1267 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1268 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1269 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1270
1271 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1272 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1273 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1274 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1275
1276 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1277 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1278 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1279 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1280
1281 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1282 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1283 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1284 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1285
1286 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1287 };
1288
1289 if (ST->hasSSE42())
1290 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1291 if (auto KindCost = Entry->Cost[CostKind])
1292 return LT.first * *KindCost;
1293
1294 static const CostKindTblEntry SSE41CostTable[] = {
1295 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1296 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1297 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1298
1299 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1300 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1301 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1302 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1303
1304 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1305 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1306 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1307 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1308
1309 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1310 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1311 };
1312
1313 if (ST->hasSSE41())
1314 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1315 if (auto KindCost = Entry->Cost[CostKind])
1316 return LT.first * *KindCost;
1317
1318 static const CostKindTblEntry SSE2CostTable[] = {
1319 // We don't correctly identify costs of casts because they are marked as
1320 // custom.
1321 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1322 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1323 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1324 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1325
1326 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1327 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1328 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1329 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1330
1331 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1332 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1333 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1334 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1335
1336 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1337 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1338 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1339 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1340
1341 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1342 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1343 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1344 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1345
1346 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1347 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1348 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1349 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1350
1351 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1352 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1353
1354 { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1355 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1356 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1357 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1358
1359 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1360
1361 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1363 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1364 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1365
1366 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1367 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1368 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1369 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1370
1371 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1372 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1373 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1374
1375 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1376 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1377 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1378
1379 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1380 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1381 };
1382
1383 if (ST->hasSSE2())
1384 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1385 if (auto KindCost = Entry->Cost[CostKind])
1386 return LT.first * *KindCost;
1387
1388 static const CostKindTblEntry SSE1CostTable[] = {
1389 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1390 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1391
1392 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1393 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1394
1395 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1396 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1397
1398 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1399 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1400
1401 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1402 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1403 };
1404
1405 if (ST->hasSSE1())
1406 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1407 if (auto KindCost = Entry->Cost[CostKind])
1408 return LT.first * *KindCost;
1409
1410 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1411 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1412 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1413 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1414 };
1415
1416 if (ST->is64Bit())
1417 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1418 if (auto KindCost = Entry->Cost[CostKind])
1419 return LT.first * *KindCost;
1420
1421 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1422 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1423 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1424 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1425
1426 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1427 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1428 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1429
1430 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1431 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1432 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1433
1434 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1435 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1436 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1437 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1438 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1439 };
1440
1441 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1442 if (auto KindCost = Entry->Cost[CostKind])
1443 return LT.first * *KindCost;
1444
1445 // It is not a good idea to vectorize division. We have to scalarize it and
1446 // in the process we will often end up having to spill regular
1447 // registers. The overhead of division is going to dominate most kernels
1448 // anyways so try hard to prevent vectorization of division - it is
1449 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1450 // to hide "20 cycles" for each lane.
1451 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1452 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1453 ISD == ISD::UREM)) {
1454 InstructionCost ScalarCost =
1456 Op1Info.getNoProps(), Op2Info.getNoProps());
1457 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1458 }
1459
1460 // Handle some basic single instruction code size cases.
1461 if (CostKind == TTI::TCK_CodeSize) {
1462 switch (ISD) {
1463 case ISD::FADD:
1464 case ISD::FSUB:
1465 case ISD::FMUL:
1466 case ISD::FDIV:
1467 case ISD::FNEG:
1468 case ISD::AND:
1469 case ISD::OR:
1470 case ISD::XOR:
1471 return LT.first;
1472 break;
1473 }
1474 }
1475
1476 // Fallback to the default implementation.
1477 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1478 Args, CxtI);
1479}
1480
1483 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1485 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1486 return TTI::TCC_Basic;
1488}
1489
1491 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1493 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1494 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1495 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1496 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1497
1498 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1499
1500 // Recognize a basic concat_vector shuffle.
1501 if (Kind == TTI::SK_PermuteTwoSrc &&
1502 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1503 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1506 CostKind, Mask.size() / 2, BaseTp);
1507
1508 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1509 if (Kind == TTI::SK_Transpose)
1510 Kind = TTI::SK_PermuteTwoSrc;
1511
1512 if (Kind == TTI::SK_Broadcast) {
1513 // For Broadcasts we are splatting the first element from the first input
1514 // register, so only need to reference that input and all the output
1515 // registers are the same.
1516 LT.first = 1;
1517
1518 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1519 using namespace PatternMatch;
1520 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1521 (ST->hasAVX2() ||
1522 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1523 return TTI::TCC_Free;
1524 }
1525
1526 // Treat <X x bfloat> shuffles as <X x half>.
1527 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1528 LT.second = LT.second.changeVectorElementType(MVT::f16);
1529
1530 // Subvector extractions are free if they start at the beginning of a
1531 // vector and cheap if the subvectors are aligned.
1532 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1533 int NumElts = LT.second.getVectorNumElements();
1534 if ((Index % NumElts) == 0)
1535 return 0;
1536 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1537 if (SubLT.second.isVector()) {
1538 int NumSubElts = SubLT.second.getVectorNumElements();
1539 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1540 return SubLT.first;
1541 // Handle some cases for widening legalization. For now we only handle
1542 // cases where the original subvector was naturally aligned and evenly
1543 // fit in its legalized subvector type.
1544 // FIXME: Remove some of the alignment restrictions.
1545 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1546 // vectors.
1547 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1548 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1549 (NumSubElts % OrigSubElts) == 0 &&
1550 LT.second.getVectorElementType() ==
1551 SubLT.second.getVectorElementType() &&
1552 LT.second.getVectorElementType().getSizeInBits() ==
1554 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1555 "Unexpected number of elements!");
1556 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1557 LT.second.getVectorNumElements());
1558 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1559 SubLT.second.getVectorNumElements());
1560 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1561 InstructionCost ExtractCost =
1562 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1563 CostKind, ExtractIndex, SubTy);
1564
1565 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1566 // if we have SSSE3 we can use pshufb.
1567 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1568 return ExtractCost + 1; // pshufd or pshufb
1569
1570 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1571 "Unexpected vector size");
1572
1573 return ExtractCost + 2; // worst case pshufhw + pshufd
1574 }
1575 }
1576 // If the extract subvector is not optimal, treat it as single op shuffle.
1578 }
1579
1580 // Subvector insertions are cheap if the subvectors are aligned.
1581 // Note that in general, the insertion starting at the beginning of a vector
1582 // isn't free, because we need to preserve the rest of the wide vector.
1583 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1584 int NumElts = LT.second.getVectorNumElements();
1585 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1586 if (SubLT.second.isVector()) {
1587 int NumSubElts = SubLT.second.getVectorNumElements();
1588 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1589 return SubLT.first;
1590 }
1591
1592 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1593 Kind = TTI::SK_PermuteTwoSrc;
1594 }
1595
1596 // Handle some common (illegal) sub-vector types as they are often very cheap
1597 // to shuffle even on targets without PSHUFB.
1598 EVT VT = TLI->getValueType(DL, BaseTp);
1599 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1600 !ST->hasSSSE3()) {
1601 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1602 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1603 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1604 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1605 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1606 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1607
1608 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1609 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1610 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1611 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1612
1613 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1614 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1615 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1616 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1617
1618 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1619 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1620 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1621 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1622 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1623
1624 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1625 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1626 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1627 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1628 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1629 };
1630
1631 if (ST->hasSSE2())
1632 if (const auto *Entry =
1633 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1634 return Entry->Cost;
1635 }
1636
1637 // We are going to permute multiple sources and the result will be in multiple
1638 // destinations. Providing an accurate cost only for splits where the element
1639 // type remains the same.
1640 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1641 MVT LegalVT = LT.second;
1642 if (LegalVT.isVector() &&
1643 LegalVT.getVectorElementType().getSizeInBits() ==
1645 LegalVT.getVectorNumElements() <
1646 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1647 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1648 unsigned LegalVTSize = LegalVT.getStoreSize();
1649 // Number of source vectors after legalization:
1650 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1651 // Number of destination vectors after legalization:
1652 InstructionCost NumOfDests = LT.first;
1653
1654 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1655 LegalVT.getVectorNumElements());
1656
1657 if (!Mask.empty() && NumOfDests.isValid()) {
1658 // Try to perform better estimation of the permutation.
1659 // 1. Split the source/destination vectors into real registers.
1660 // 2. Do the mask analysis to identify which real registers are
1661 // permuted. If more than 1 source registers are used for the
1662 // destination register building, the cost for this destination register
1663 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1664 // source register is used, build mask and calculate the cost as a cost
1665 // of PermuteSingleSrc.
1666 // Also, for the single register permute we try to identify if the
1667 // destination register is just a copy of the source register or the
1668 // copy of the previous destination register (the cost is
1669 // TTI::TCC_Basic). If the source register is just reused, the cost for
1670 // this operation is 0.
1671 NumOfDests =
1673 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1674 .first;
1675 unsigned E = *NumOfDests.getValue();
1676 unsigned NormalizedVF =
1677 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1678 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1679 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1680 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1681 copy(Mask, NormalizedMask.begin());
1682 unsigned PrevSrcReg = 0;
1683 ArrayRef<int> PrevRegMask;
1686 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1687 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1688 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1689 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1690 // Check if the previous register can be just copied to the next
1691 // one.
1692 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1693 PrevRegMask != RegMask)
1695 RegMask, CostKind, 0, nullptr);
1696 else
1697 // Just a copy of previous destination register.
1699 return;
1700 }
1701 if (SrcReg != DestReg &&
1702 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1703 // Just a copy of the source register.
1705 }
1706 PrevSrcReg = SrcReg;
1707 PrevRegMask = RegMask;
1708 },
1709 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1710 unsigned /*Unused*/,
1711 unsigned /*Unused*/) {
1712 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1713 CostKind, 0, nullptr);
1714 });
1715 return Cost;
1716 }
1717
1718 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1719 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1720 std::nullopt, CostKind, 0, nullptr);
1721 }
1722
1723 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1724 }
1725
1726 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1727 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1728 // We assume that source and destination have the same vector type.
1729 InstructionCost NumOfDests = LT.first;
1730 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1731 LT.first = NumOfDests * NumOfShufflesPerDest;
1732 }
1733
1734 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1735 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1736 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1737
1738 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1739 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1740
1741 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1742 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1743 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1744 };
1745
1746 if (ST->hasVBMI())
1747 if (const auto *Entry =
1748 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1749 return LT.first * Entry->Cost;
1750
1751 static const CostTblEntry AVX512BWShuffleTbl[] = {
1752 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1753 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1754 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1755
1756 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1757 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1758 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1759 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1760
1761 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1762 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1763 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1764 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1765 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1766
1767 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1768 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1769 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1770 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1771 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1772
1773 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1774 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1775
1776 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1777 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1778 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1779 };
1780
1781 if (ST->hasBWI())
1782 if (const auto *Entry =
1783 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1784 return LT.first * Entry->Cost;
1785
1786 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1787 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1788 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1789 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1790 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1791 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1792 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1793 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1794
1795 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1796 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1797 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1798 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1799 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1800 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1801 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1802
1803 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1804 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1805 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1806 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1807 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1808 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1809 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1810 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1811 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1812 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1813 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1814
1815 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1816 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1817 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1818 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1819 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1820 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1821 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1822 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1823 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1824 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1825 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1826 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1827 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1828
1829 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1830 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1831 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1832 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1833 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1834 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1835 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1836 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1837 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1838 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1839 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1840 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1841
1842 // FIXME: This just applies the type legalization cost rules above
1843 // assuming these completely split.
1844 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1845 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1846 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1847 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1848 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1849 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1850
1851 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1852 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1853 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1854 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1855 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1856 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1857 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1858 };
1859
1860 if (ST->hasAVX512())
1861 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1862 if (auto KindCost = Entry->Cost[CostKind])
1863 return LT.first * *KindCost;
1864
1865 static const CostTblEntry AVX2ShuffleTbl[] = {
1866 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastsd
1867 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastss
1868 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1869 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1870 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1871 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1872 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1873
1874 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1875 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1876 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1877 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1878 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1879 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1880 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1881
1882 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1883 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1884 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1885
1886 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1887 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1888 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1889 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1890 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1891
1892 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1893 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1894 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1895 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1896 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1897 // + vpblendvb
1898 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1899 // + vpblendvb
1900 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1901 // + vpblendvb
1902
1903 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1904 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1905 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1906 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1907 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1908 // + vpblendvb
1909 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1910 // + vpblendvb
1911 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1912 // + vpblendvb
1913 };
1914
1915 if (ST->hasAVX2())
1916 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1917 return LT.first * Entry->Cost;
1918
1919 static const CostTblEntry XOPShuffleTbl[] = {
1920 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1921 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1922 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1923 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1924 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1925 // + vinsertf128
1926 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1927 // + vinsertf128
1928
1929 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1930 // + vinsertf128
1931 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1932 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1933 // + vinsertf128
1934 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1935 };
1936
1937 if (ST->hasXOP())
1938 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1939 return LT.first * Entry->Cost;
1940
1941 static const CostTblEntry AVX1ShuffleTbl[] = {
1942 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1943 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1944 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1945 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1946 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1947 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1948 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1949
1950 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1951 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1952 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1953 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1954 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1955 // + vinsertf128
1956 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1957 // + vinsertf128
1958 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1959 // + vinsertf128
1960
1961 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1962 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1963 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1964 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1965 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1966 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1967 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1968
1969 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1970 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1971 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1972 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1973 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1974 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1975 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1976
1977 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1978 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1979 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1980 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1981 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1982 // + 2*por + vinsertf128
1983 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1984 // + 2*por + vinsertf128
1985 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1986 // + 2*por + vinsertf128
1987
1988 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1989 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1990 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1991 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1992 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1993 // + 4*por + vinsertf128
1994 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1995 // + 4*por + vinsertf128
1996 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1997 // + 4*por + vinsertf128
1998 };
1999
2000 if (ST->hasAVX())
2001 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2002 return LT.first * Entry->Cost;
2003
2004 static const CostTblEntry SSE41ShuffleTbl[] = {
2005 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2006 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2007 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2008 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2009 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2010 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2011 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2012 };
2013
2014 if (ST->hasSSE41())
2015 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2016 return LT.first * Entry->Cost;
2017
2018 static const CostTblEntry SSSE3ShuffleTbl[] = {
2019 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2020 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2021 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2022
2023 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2024 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2025 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2026
2027 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2028 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2029 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2030
2031 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2032 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2033 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2034 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2035 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2036
2037 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2038 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2039 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2040
2041 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2042 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2043 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2044 };
2045
2046 if (ST->hasSSSE3())
2047 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2048 return LT.first * Entry->Cost;
2049
2050 static const CostTblEntry SSE2ShuffleTbl[] = {
2051 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2052 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2053 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2054 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2055 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2056 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2057
2058 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2059 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2060 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2061 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2062 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2063 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2064 // + 2*pshufd + 2*unpck + packus
2065
2066 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2067 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2068 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2069 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2070 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2071 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2072
2073 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2074 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2075 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2076 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2077 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2078 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2079
2080 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2081 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2082 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2083 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2084 // + pshufd/unpck
2085 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2086 // + pshufd/unpck
2087 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2088 // + 2*pshufd + 2*unpck + 2*packus
2089
2090 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2091 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2092 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2093 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2094 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2095 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2096 };
2097
2098 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2099 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2100 };
2101
2102 if (ST->hasSSE2()) {
2103 bool IsLoad =
2104 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2105 if (ST->hasSSE3() && IsLoad)
2106 if (const auto *Entry =
2107 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2109 LT.second.getVectorElementCount()) &&
2110 "Table entry missing from isLegalBroadcastLoad()");
2111 return LT.first * Entry->Cost;
2112 }
2113
2114 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2115 return LT.first * Entry->Cost;
2116 }
2117
2118 static const CostTblEntry SSE1ShuffleTbl[] = {
2119 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2120 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2121 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2122 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2123 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2124 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2125 };
2126
2127 if (ST->hasSSE1())
2128 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2129 return LT.first * Entry->Cost;
2130
2131 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2132}
2133
2135 Type *Src,
2138 const Instruction *I) {
2139 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2140 assert(ISD && "Invalid opcode");
2141
2142 // The cost tables include both specific, custom (non-legal) src/dst type
2143 // conversions and generic, legalized types. We test for customs first, before
2144 // falling back to legalization.
2145 // FIXME: Need a better design of the cost table to handle non-simple types of
2146 // potential massive combinations (elem_num x src_type x dst_type).
2147 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2148 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2149 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2150
2151 // Mask sign extend has an instruction.
2152 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2153 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2154 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2155 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2156 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2157 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2158 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2159 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2160 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2161 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2162 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2163 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2164 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2165 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2166 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2167 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2168 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2169
2170 // Mask zero extend is a sext + shift.
2171 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2172 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2173 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2174 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2175 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2176 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2177 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2178 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2179 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2180 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2181 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2182 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2183 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2184 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2185 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2186 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2187 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2188
2189 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2190 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2191 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2192 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2193 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2194 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2195 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2196 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2197 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2198 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2199 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2200 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2201 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2202 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2203 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2204 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2205 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2206
2207 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2208 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2209 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2210 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2211 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2212 };
2213
2214 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2215 // Mask sign extend has an instruction.
2216 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2217 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2218 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2219 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2220 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2221 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2222 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2223 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2224
2225 // Mask zero extend is a sext + shift.
2226 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2227 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2228 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2229 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2230 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2231 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2232 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2233 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2234
2235 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2236 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2237 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2238 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2239 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2240 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2241 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2242 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2243
2244 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2245 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2246
2247 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2248 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2249
2250 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2251 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2252
2253 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2254 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2255 };
2256
2257 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2258 // 256-bit wide vectors.
2259
2260 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2261 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2262 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2263 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2264 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2265
2266 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2267 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2268 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2269 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2270 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2271 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2272 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2273 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2274 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2275 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2276 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2277 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2278 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2279 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2280 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2281 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2282 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2283 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2284 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2285 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2286 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2287 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2288 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2289 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2290 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2291 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2292 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2293 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2294 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2295 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2296 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2297 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2298 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2299 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2300
2301 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2302 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2303 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2304
2305 // Sign extend is zmm vpternlogd+vptruncdb.
2306 // Zero extend is zmm broadcast load+vptruncdb.
2307 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2308 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2309 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2310 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2311 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2312 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2313 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2314 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2315
2316 // Sign extend is zmm vpternlogd+vptruncdw.
2317 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2318 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2319 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2320 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2321 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2322 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2323 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2324 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2325 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2326
2327 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2328 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2329 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2330 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2331 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2332 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2333 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2334 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2335 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2336 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2337
2338 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2339 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2340 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2341 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2342
2343 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2344 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2345 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2346 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2347 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2348 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2349 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2350 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2351 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2352 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2353
2354 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2355 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2356
2357 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2358 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2359 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2360 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2361 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2362 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2363 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2364 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2365
2366 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2367 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2368 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2369 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2370 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2371 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2372 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2373 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2374 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2375 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2376
2377 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2378 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2379 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2380 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2381 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2382 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2383 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2384 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2385 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2386 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2387 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2388
2389 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2390 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2391 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2392 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2393 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2394 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2395 };
2396
2397 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] {
2398 // Mask sign extend has an instruction.
2399 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2400 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2401 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2402 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2403 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2404 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2405 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2406 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2407 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2408 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2409 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2410 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2411 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2412 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2413 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2414 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2415 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2416
2417 // Mask zero extend is a sext + shift.
2418 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2419 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2420 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2421 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2422 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2423 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2424 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2425 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2426 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2427 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2428 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2429 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2430 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2431 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2432 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2433 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2434 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2435
2436 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2437 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2438 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2439 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2440 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2441 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2442 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2443 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2444 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2445 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2446 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2447 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2448 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2449 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2450 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2451 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2452 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2453
2454 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2455 };
2456
2457 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2458 // Mask sign extend has an instruction.
2459 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2460 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2461 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2462 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2463 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2464 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2465 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2466 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2467
2468 // Mask zero extend is a sext + shift.
2469 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2470 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2471 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2472 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2473 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2474 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2475 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2476 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2477
2478 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2479 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2480 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2481 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2482 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2483 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2484 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2485 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2486
2487 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2488 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2489 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2490 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2491
2492 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2493 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2494 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2495 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2496
2497 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2498 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2499 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2500 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2501
2502 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2503 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2504 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2505 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2506 };
2507
2508 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2509 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2510 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2511 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2512 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2513 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2514 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2515 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2516 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2517 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2518 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2519 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2520 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2521 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2522 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2523 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2524 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2525 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2526 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovdb
2527
2528 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2529 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2530 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2531 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2532 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2533 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2534 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2535 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2536 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2537 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2538
2539 // sign extend is vpcmpeq+maskedmove+vpmovdw
2540 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2541 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2542 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2543 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2544 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2545 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2546 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2547 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2548 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2549
2550 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2551 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2552 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2553 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2554 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2555 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2556 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2557 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2558
2559 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2560 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2561 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2562 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2563
2564 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2565 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2566 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2567 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2568 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2569 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2570 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2571 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2572 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2573 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2574 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2575 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2576
2577 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2578 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2579 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2580 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2581
2582 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2583 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2584 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2585 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2586 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2587 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2588 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2589 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2590 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2591 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2592 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2593 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2594 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2595
2596 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2597 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2598 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2599
2600 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2601 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2602 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2603 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2604 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2605 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2606 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2607 };
2608
2609 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2610 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2611 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2612 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2613 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2614 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2615 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2616
2617 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2618 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2619 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2620 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2621 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2622 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2623 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2624 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2625 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2626 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2627 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2628 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2629 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2630 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2631
2632 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2633
2634 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2635 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2636 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2637 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2638 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2639 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2640 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2641 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2642 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2643 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2644 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2645 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2646
2647 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2648 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2649
2650 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2651 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2652 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2653 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2654
2655 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2656 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2657 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2658 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2659 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2660 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2661 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2662 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2663
2664 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2665 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2666 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2667 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2668 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2669 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2670 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2671
2672 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2673 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2674 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2675 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2676 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2677 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2678 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2679 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2680 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2681 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2682 };
2683
2684 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2685 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2686 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2687 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2688 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2689 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2690 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2691
2692 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2693 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2694 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2695 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2696 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2697 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2698 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2699 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2700 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2701 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2702 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2703 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2704
2705 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2706 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2707 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2708 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2709 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2710
2711 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2712 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2713 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2714 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2715 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2716 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2717 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2718 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2719
2720 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2721 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2722 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2723 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2724 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2725 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2726 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2727 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2728 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2729 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2730 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2731 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2732
2733 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2734 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2735 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2736 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2737 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2738 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2739 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2740 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2741 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2742 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2743 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2744 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2745 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2746 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2747 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2748 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2749 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2750
2751 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2752 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2753 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2754 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2755 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2756 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2757 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2758 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2759 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2760 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2761 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2762
2763 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2764 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2765 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2766 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2767 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2768 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2769 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2770 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2771 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2772 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2773 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2774 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2775 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2776
2777 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2778 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2779 };
2780
2781 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2782 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2783 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2784 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2785 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2786 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2787 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2788 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2789 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2790 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2791 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2792 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2793 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2794
2795 // These truncates end up widening elements.
2796 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVXZBQ
2797 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ
2798 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVXZBD
2799
2800 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2801 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2802 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2803
2804 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2805 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2806 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2807 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2808 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2809 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2810 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2811 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2812 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2813 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2814 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2815
2816 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2817 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2818 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2819 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2820 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2821 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2822 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2823 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2824 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2825 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2826 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2827 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2828 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2829 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2830
2831 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2832 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2833 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2834 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2835 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2836 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2837 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2838 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2839 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2840 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2841
2842 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2843 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2844 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2845 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2846 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2847 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2848 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2849 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2850 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2851 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2852 };
2853
2854 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2855 // These are somewhat magic numbers justified by comparing the
2856 // output of llvm-mca for our various supported scheduler models
2857 // and basing it off the worst case scenario.
2858 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2859 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2860 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2861 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2862 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2863 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2864 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2865 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2866 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2867 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
2868 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
2869 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
2870
2871 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2872 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2873 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
2874 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
2875 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2876 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
2877 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
2878 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2879 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
2880 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
2881 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2882 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
2883 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
2884
2885 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2886 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2887 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2888 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2889 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2890 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2891 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2892 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2893 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2894 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2895
2896 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2897 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2898 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2899 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
2900 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2901 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2902 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2903 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2904 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
2905 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
2906
2907 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2908 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2909 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
2910 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
2911 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2912 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
2913 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
2914 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
2915 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2916 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
2917 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2918 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
2919
2920 // These truncates are really widening elements.
2921 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
2922 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
2923 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
2924 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
2925 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
2926 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
2927
2928 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
2929 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
2930 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
2931 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
2932 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
2933 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
2934 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2935 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
2936 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
2937 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
2938 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
2939 };
2940
2941 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2942 EVT SrcTy = TLI->getValueType(DL, Src);
2943 EVT DstTy = TLI->getValueType(DL, Dst);
2944
2945 // The function getSimpleVT only handles simple value types.
2946 if (SrcTy.isSimple() && DstTy.isSimple()) {
2947 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2948 MVT SimpleDstTy = DstTy.getSimpleVT();
2949
2950 if (ST->useAVX512Regs()) {
2951 if (ST->hasBWI())
2952 if (const auto *Entry = ConvertCostTableLookup(
2953 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2954 if (auto KindCost = Entry->Cost[CostKind])
2955 return *KindCost;
2956
2957 if (ST->hasDQI())
2958 if (const auto *Entry = ConvertCostTableLookup(
2959 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2960 if (auto KindCost = Entry->Cost[CostKind])
2961 return *KindCost;
2962
2963 if (ST->hasAVX512())
2964 if (const auto *Entry = ConvertCostTableLookup(
2965 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2966 if (auto KindCost = Entry->Cost[CostKind])
2967 return *KindCost;
2968 }
2969
2970 if (ST->hasBWI())
2971 if (const auto *Entry = ConvertCostTableLookup(
2972 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2973 if (auto KindCost = Entry->Cost[CostKind])
2974 return *KindCost;
2975
2976 if (ST->hasDQI())
2977 if (const auto *Entry = ConvertCostTableLookup(
2978 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2979 if (auto KindCost = Entry->Cost[CostKind])
2980 return *KindCost;
2981
2982 if (ST->hasAVX512())
2983 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2984 SimpleDstTy, SimpleSrcTy))
2985 if (auto KindCost = Entry->Cost[CostKind])
2986 return *KindCost;
2987
2988 if (ST->hasAVX2()) {
2989 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2990 SimpleDstTy, SimpleSrcTy))
2991 if (auto KindCost = Entry->Cost[CostKind])
2992 return *KindCost;
2993 }
2994
2995 if (ST->hasAVX()) {
2996 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2997 SimpleDstTy, SimpleSrcTy))
2998 if (auto KindCost = Entry->Cost[CostKind])
2999 return *KindCost;
3000 }
3001
3002 if (ST->hasSSE41()) {
3003 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3004 SimpleDstTy, SimpleSrcTy))
3005 if (auto KindCost = Entry->Cost[CostKind])
3006 return *KindCost;
3007 }
3008
3009 if (ST->hasSSE2()) {
3010 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3011 SimpleDstTy, SimpleSrcTy))
3012 if (auto KindCost = Entry->Cost[CostKind])
3013 return *KindCost;
3014 }
3015 }
3016
3017 // Fall back to legalized types.
3018 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3019 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3020
3021 // If we're truncating to the same legalized type - just assume it's free.
3022 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3023 return TTI::TCC_Free;
3024
3025 if (ST->useAVX512Regs()) {
3026 if (ST->hasBWI())
3027 if (const auto *Entry = ConvertCostTableLookup(
3028 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3029 if (auto KindCost = Entry->Cost[CostKind])
3030 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3031
3032 if (ST->hasDQI())
3033 if (const auto *Entry = ConvertCostTableLookup(
3034 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3035 if (auto KindCost = Entry->Cost[CostKind])
3036 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3037
3038 if (ST->hasAVX512())
3039 if (const auto *Entry = ConvertCostTableLookup(
3040 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3041 if (auto KindCost = Entry->Cost[CostKind])
3042 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3043 }
3044
3045 if (ST->hasBWI())
3046 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3047 LTDest.second, LTSrc.second))
3048 if (auto KindCost = Entry->Cost[CostKind])
3049 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3050
3051 if (ST->hasDQI())
3052 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3053 LTDest.second, LTSrc.second))
3054 if (auto KindCost = Entry->Cost[CostKind])
3055 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3056
3057 if (ST->hasAVX512())
3058 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3059 LTDest.second, LTSrc.second))
3060 if (auto KindCost = Entry->Cost[CostKind])
3061 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3062
3063 if (ST->hasAVX2())
3064 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3065 LTDest.second, LTSrc.second))
3066 if (auto KindCost = Entry->Cost[CostKind])
3067 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3068
3069 if (ST->hasAVX())
3070 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3071 LTDest.second, LTSrc.second))
3072 if (auto KindCost = Entry->Cost[CostKind])
3073 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3074
3075 if (ST->hasSSE41())
3076 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3077 LTDest.second, LTSrc.second))
3078 if (auto KindCost = Entry->Cost[CostKind])
3079 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3080
3081 if (ST->hasSSE2())
3082 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3083 LTDest.second, LTSrc.second))
3084 if (auto KindCost = Entry->Cost[CostKind])
3085 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3086
3087 // Fallback: for i8/i16 sitofp/uitofp cases, first extend the source to i32
3088 // and then cost the conversion as a sitofp from i32.
3089 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3090 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3091 Type *ExtSrc = Src->getWithNewBitWidth(32);
3092 unsigned ExtOpc =
3093 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3094
3095 // For scalar loads the extend would be free.
3096 InstructionCost ExtCost = 0;
3097 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3098 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3099
3100 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3102 }
3103
3104 // Fallback: for fptosi/fptoui i8/i16 cases, convert to i32 with fptosi and
3105 // then truncate the i32 result to the destination type.
3106 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3107 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3108 Type *TruncDst = Dst->getWithNewBitWidth(32);
3109 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3110 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3112 }
3113
3114 // TODO: Allow non-throughput costs that aren't binary.
3115 auto AdjustCost = [&CostKind](InstructionCost Cost,
3118 return Cost == 0 ? 0 : N;
3119 return Cost * N;
3120 };
3121 return AdjustCost(
3122 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3123}
3124
3126 Type *CondTy,
3127 CmpInst::Predicate VecPred,
3129 const Instruction *I) {
3130 // Early out if this type isn't scalar/vector integer/float.
3131 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3132 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3133 I);
3134
3135 // Legalize the type.
3136 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3137
3138 MVT MTy = LT.second;
3139
3140 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3141 assert(ISD && "Invalid opcode");
3142
3143 InstructionCost ExtraCost = 0;
3144 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3145 // Some vector comparison predicates cost extra instructions.
3146 // TODO: Adjust ExtraCost based on CostKind?
3147 // TODO: Should we invert this and assume worst case cmp costs
3148 // and reduce for particular predicates?
3149 if (MTy.isVector() &&
3150 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3151 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3152 ST->hasBWI())) {
3153 // Fallback to I if a specific predicate wasn't specified.
3154 CmpInst::Predicate Pred = VecPred;
3155 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3157 Pred = cast<CmpInst>(I)->getPredicate();
3158
3159 bool CmpWithConstant = false;
3160 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3161 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3162
3163 switch (Pred) {
3165 // xor(cmpeq(x,y),-1)
3166 ExtraCost = CmpWithConstant ? 0 : 1;
3167 break;
3170 // xor(cmpgt(x,y),-1)
3171 ExtraCost = CmpWithConstant ? 0 : 1;
3172 break;
3175 // cmpgt(xor(x,signbit),xor(y,signbit))
3176 // xor(cmpeq(pmaxu(x,y),x),-1)
3177 ExtraCost = CmpWithConstant ? 1 : 2;
3178 break;
3181 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3182 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3183 // cmpeq(psubus(x,y),0)
3184 // cmpeq(pminu(x,y),x)
3185 ExtraCost = 1;
3186 } else {
3187 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3188 ExtraCost = CmpWithConstant ? 2 : 3;
3189 }
3190 break;
3193 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3194 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3195 if (CondTy && !ST->hasAVX())
3196 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3198 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3200 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3201
3202 break;
3205 // Assume worst case scenario and add the maximum extra cost.
3206 ExtraCost = 3;
3207 break;
3208 default:
3209 break;
3210 }
3211 }
3212 }
3213
3214 static const CostKindTblEntry SLMCostTbl[] = {
3215 // slm pcmpeq/pcmpgt throughput is 2
3216 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3217 // slm pblendvb/blendvpd/blendvps throughput is 4
3218 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3219 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3220 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3221 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3222 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3223 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3224 };
3225
3226 static const CostKindTblEntry AVX512BWCostTbl[] = {
3227 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3228 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3229 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3230 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3231
3232 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3233 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3234 };
3235
3236 static const CostKindTblEntry AVX512CostTbl[] = {
3237 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3238 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3239 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3240 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3241
3242 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3243 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3244 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3245 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3246 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3247 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3248 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3249
3250 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3251 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3252 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3253 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3254 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3255 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3256 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3257 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3258 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3259 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3260 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3261 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3262 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3263 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3264
3265 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3266 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3267 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3268 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3269 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3270 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3271 };
3272
// SETCC/SELECT costs looked up when ST->hasAVX2() (see the lookup cascade that
// follows the tables). Entries are keyed by (ISD opcode, legalized MVT) and the
// braces hold one cost per cost kind, selected via Entry->Cost[CostKind].
// NOTE(review): slot order is presumably { RecipThroughput, Latency, CodeSize,
// SizeAndLatency } - confirm against the CostKindTblEntry definition.
// This table is checked after the AVX512 tables and before XOP/AVX1.
3273 static const CostKindTblEntry AVX2CostTbl[] = {
3274 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3275 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3276 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3277 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3278 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3279 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3280
3281 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3282 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3283 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3284 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3285
3286 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3287 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3288 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3289 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3290 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3291 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3292 };
3293
// 64-bit-element integer SETCC costs looked up when ST->hasXOP(). The XOP
// lookup runs before the AVX1 and SSE4.2 lookups, so on XOP subtargets these
// entries win over the v4i64 entry in AVX1CostTbl and the v2i64 entry in
// SSE42CostTbl.
3294 static const CostKindTblEntry XOPCostTbl[] = {
3295 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3296 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3297 };
3298
// SETCC/SELECT costs looked up when ST->hasAVX() and no earlier table
// (SLM, AVX512BW, AVX512, AVX2, XOP) supplied a cost for the requested kind.
// Entries are keyed by (ISD opcode, legalized MVT); the braces hold one cost per
// cost kind, selected via Entry->Cost[CostKind].
3299 static const CostKindTblEntry AVX1CostTbl[] = {
3300 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3301 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3302 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3303 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3304 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3305 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3306
3307 // AVX1 does not support 256-bit integer compares (all four types below).
3308 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3309 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3310 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3311 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3312
3313 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3314 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3315 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3316 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3317 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3318 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3319 };
3320
// Single v2i64 SETCC entry looked up when ST->hasSSE42(). It is consulted
// before SSE2CostTbl, whose v2i64 SETCC entry models the costlier
// pcmpeqd/pcmpgtd expansion.
3321 static const CostKindTblEntry SSE42CostTbl[] = {
3322 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3323 };
3324
// SETCC/SELECT costs looked up when ST->hasSSE41(). The SELECT entries model a
// single blendv-style instruction (named per entry); this table is consulted
// before SSE2CostTbl/SSE1CostTbl, whose SELECT entries model the three-op
// and/andn/or sequence instead.
3325 static const CostKindTblEntry SSE41CostTbl[] = {
3326 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3327 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3328
3329 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3330 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3331 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3332 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3333 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3334 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3335 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3336 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3337 };
3338
// SETCC/SELECT costs looked up when ST->hasSSE2(), after every newer feature
// table has been tried. Covers f64/v2f64 and the 128-bit integer vector types;
// the f32 counterparts live in SSE1CostTbl. SELECT is costed as the
// and + andn + or sequence named on each entry.
3339 static const CostKindTblEntry SSE2CostTbl[] = {
3340 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3341 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3342
3343 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3344 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3345 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3346 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3347
3348 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3349 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3350 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3351 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3352 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3353 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3354 };
3355
// f32/v4f32 SETCC/SELECT costs looked up when ST->hasSSE1(). This is the last
// table in the lookup cascade; if it also misses, the function falls through to
// the fp-select latency special case and then to
// BaseT::getCmpSelInstrCost.
3356 static const CostKindTblEntry SSE1CostTbl[] = {
3357 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3358 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3359
3360 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3361 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3362 };
3363
3364 if (ST->useSLMArithCosts())
3365 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3366 if (auto KindCost = Entry->Cost[CostKind])
3367 return LT.first * (ExtraCost + *KindCost);
3368
3369 if (ST->hasBWI())
3370 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3371 if (auto KindCost = Entry->Cost[CostKind])
3372 return LT.first * (ExtraCost + *KindCost);
3373
3374 if (ST->hasAVX512())
3375 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3376 if (auto KindCost = Entry->Cost[CostKind])
3377 return LT.first * (ExtraCost + *KindCost);
3378
3379 if (ST->hasAVX2())
3380 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3381 if (auto KindCost = Entry->Cost[CostKind])
3382 return LT.first * (ExtraCost + *KindCost);
3383
3384 if (ST->hasXOP())
3385 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3386 if (auto KindCost = Entry->Cost[CostKind])
3387 return LT.first * (ExtraCost + *KindCost);
3388
3389 if (ST->hasAVX())
3390 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3391 if (auto KindCost = Entry->Cost[CostKind])
3392 return LT.first * (ExtraCost + *KindCost);
3393
3394 if (ST->hasSSE42())
3395 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3396 if (auto KindCost = Entry->Cost[CostKind])
3397 return LT.first * (ExtraCost + *KindCost);
3398
3399 if (ST->hasSSE41())
3400 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3401 if (auto KindCost = Entry->Cost[CostKind])
3402 return LT.first * (ExtraCost + *KindCost);
3403
3404 if (ST->hasSSE2())
3405 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3406 if (auto KindCost = Entry->Cost[CostKind])
3407 return LT.first * (ExtraCost + *KindCost);
3408
3409 if (ST->hasSSE1())
3410 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3411 if (auto KindCost = Entry->Cost[CostKind])
3412 return LT.first * (ExtraCost + *KindCost);
3413
3414 // Assume a 3cy latency for fp select ops.
3415 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3416 if (ValTy->getScalarType()->isFloatingPointTy())
3417 return 3;
3418
3419 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3420}
3421
3423
3427 // Costs should match the codegen from:
3428 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3429 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3430 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3431 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3432 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3433
3434 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3435 // specialized in these tables yet.
// Funnel-shift (FSHL) and 16-bit-element rotate (ROTL/ROTR/VROTLI) costs; every
// entry is { 1, 1, 1, 1 }, i.e. one unit for each cost kind.
// NOTE(review): presumably consulted only when the subtarget has AVX512VBMI2 -
// the lookup that reads this table is outside this excerpt; confirm.
3436 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3437 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3438 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3439 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3440 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3441 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3442 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3443 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3444 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3445 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3446 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3447 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3448 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3449 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3450 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3451 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3452 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3453 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3454 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3455 };
// CTPOP costs for 8- and 16-bit element vectors (128/256/512-bit); every entry
// is one unit for each cost kind.
// NOTE(review): presumably gated on the AVX512BITALG feature - the consuming
// lookup is outside this excerpt; confirm.
3456 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3457 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3458 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3459 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3460 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3461 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3462 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3463 };
// CTPOP costs for 32- and 64-bit element vectors (128/256/512-bit); every entry
// is one unit for each cost kind.
// NOTE(review): presumably gated on the AVX512VPOPCNTDQ feature - the consuming
// lookup is outside this excerpt; confirm.
3464 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3465 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3466 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3467 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3468 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3469 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3470 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3471 };
// CTLZ and CTTZ costs. The 32/64-bit element CTLZ entries are cheap
// ({ 1, 5, 1, 1 }); the 8/16-bit element CTLZ entries are noticeably costlier.
// CTTZ is listed for 32/64-bit elements only.
// NOTE(review): presumably gated on the AVX512CD feature - the consuming lookup
// is outside this excerpt; confirm.
3472 static const CostKindTblEntry AVX512CDCostTbl[] = {
3473 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3474 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3475 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3476 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3477 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3478 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3479 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3480 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3481 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3482 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3483 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3484 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3485
3486 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3487 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3488 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3489 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3490 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3491 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3492 };
// Intrinsic costs keyed by (ISD opcode, MVT): ABS, BITREVERSE, BSWAP, CTLZ,
// CTPOP, CTTZ, rotates, saturating add/sub and integer min/max.
// The saturating add/sub entries give a single value, so only the first cost
// slot is populated for them.
// NOTE(review): presumably gated on ST->hasBWI(), and single-value entries
// presumably leave the other cost kinds unset so a lookup for those kinds falls
// through - the consuming code is outside this excerpt; confirm both.
3493 static const CostKindTblEntry AVX512BWCostTbl[] = {
3494 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3495 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3496 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3497 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3498 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3499 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3500 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3501 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3502 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3503 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3504 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3505 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3506 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3507 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3508 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3509 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3510 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3511 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3512 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3513 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3514 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3515 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3516 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3517 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3518 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3519 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3520 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3521 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3522 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3523 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3524 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3525 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3526 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3527 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3528 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3529 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3530 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3531 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3532 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3533 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3534 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3535 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3536 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3537 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3538 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3539 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3540 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3541 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3542 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3543 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3544 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3545 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3546 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3547 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3548 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3549 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3550 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3551 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3552 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3553 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3554 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3555 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3556 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3557 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3558 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3559 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3560 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3561 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3562 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3563 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3564 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3565 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3566 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3567 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3568 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3569 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3570 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3571 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3572 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3573 };
// Intrinsic costs keyed by (ISD opcode, MVT) for the base AVX-512 level.
// The v32i16/v64i8 entries here are costlier than the AVX512BWCostTbl entries
// for the same opcode/type (e.g. ABS { 2, 7, 4, 4 } here vs { 1, 1, 1, 1 }
// there). Saturating add/sub entries give a single value, so only the first
// cost slot is populated; FSQRT numbers are attributed to Skylake per the
// per-entry comments.
// NOTE(review): presumably gated on ST->hasAVX512() and used as the fallback
// when BWI is unavailable - the consuming lookup is outside this excerpt;
// confirm.
3574 static const CostKindTblEntry AVX512CostTbl[] = {
3575 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3576 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3577 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3578 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3579 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3580 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3581 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3582 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3583 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3584 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3585 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3586 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3587 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3588 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3589 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3590 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3591 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3592 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3593 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3594 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3595 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3596 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3597 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3598 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3599 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3600 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3601 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3602 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3603 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3604 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3605 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3606 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3607 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3608 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3609 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3610 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3611 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3612 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3613 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3614 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3615 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3616 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3617 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3618 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3619 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3620 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3621 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3622 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3623 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3624 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3625 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3626 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3627 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3628 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3629 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3630 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3631 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3632 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3633 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3634 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3635 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3636 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3637 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3638 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3639 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3640 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3641 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3642 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3643 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3644 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3645 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3646 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3647 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3648 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3649 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3650 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3651 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3652 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3653 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3654 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3655 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3656 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3657 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3658 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3659 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3660 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3661 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3662 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3663 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3664 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3665 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3666 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3667 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3668 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3669 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3670 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3671 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3672 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3673 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3674 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3675 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3676 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3677 };
// BITREVERSE and rotate costs for 128/256-bit vectors, plus scalar BITREVERSE.
// ROTR entries are costlier than the matching ROTL entries in the code-size and
// size+latency slots because, per the in-table note, ROTR is modelled as
// VPROT on a negated amount (an extra SUB).
// NOTE(review): presumably gated on ST->hasXOP() - the consuming lookup is
// outside this excerpt; confirm.
3678 static const CostKindTblEntry XOPCostTbl[] = {
3679 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3680 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3681 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3682 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3683 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3684 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3685 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3686 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3687 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3688 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3689 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3690 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3691 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3692 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3693 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3694 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3695 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3696 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3697 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3698 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3699 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3700 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3701 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3702 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3703 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3704 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3705 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3706 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3707 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3708 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3709 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3710 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3711 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3712 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3713 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3714 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3715 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3716 };
// Intrinsic costs keyed by (ISD opcode, MVT) for 128/256-bit vectors and scalar
// fp: ABS, BITREVERSE, BSWAP, CTLZ, CTPOP, CTTZ, saturating add/sub, integer
// min/max, FMAXNUM and FSQRT. Saturating add/sub entries give a single value,
// so only the first cost slot is populated for them.
// NOTE(review): presumably gated on ST->hasAVX2() - the consuming lookup is
// outside this excerpt; confirm.
3717 static const CostKindTblEntry AVX2CostTbl[] = {
3718 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3719 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3720 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3721 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3722 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3723 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3724 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3725 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3726 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3727 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3728 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3729 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3730 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3731 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3732 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3733 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3734 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3735 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3736 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3737 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3738 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3739 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3740 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3741 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3742 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3743 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3744 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3745 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3746 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3747 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3748 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3749 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3750 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3751 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3752 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3753 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3754 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3755 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3756 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3757 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3758 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3759 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3760 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3761 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3762 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3763 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3764 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3765 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3766 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3767 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3768 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3769 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3770 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3771 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3772 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3773 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3774 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3775 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3776 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3777 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3778 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3779 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3780 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3781 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3782 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3783 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3784 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3785 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3786 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3787 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3788 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3789 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3790 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3791 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3792 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3793 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3794 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3795 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3796 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3797 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3798 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3799 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3800 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3801 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3802 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3803 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3804 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3805 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3806 };
// Intrinsic costs keyed by (ISD opcode, MVT) for 128/256-bit vectors and scalar
// fp. Many 256-bit integer entries are annotated "2 x 128-bit Op +
// extract/insert", i.e. they are costed as a split into two 128-bit halves.
// Saturating add/sub entries give a single value, so only the first cost slot
// is populated for them.
// NOTE(review): presumably gated on ST->hasAVX() - the consuming lookup is
// outside this excerpt; confirm.
3807 static const CostKindTblEntry AVX1CostTbl[] = {
3808 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3809 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3810 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3811 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3812 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3813 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3814 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3815 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3816 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3817 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3818 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3819 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3820 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3821 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3822 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3823 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3824 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3825 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3826 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3827 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3828 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3829 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3830 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3831 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3832 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3833 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3834 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3835 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3836 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3837 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3838 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3839 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3840 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3841 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3842 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3843 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3844 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3845 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3846 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3847 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3848 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3849 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3850 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3851 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3852 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3853 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3854 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3855 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3856 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3857 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3858 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3859 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3860 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3861 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3862 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3863 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3864 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3865 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3866 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3867 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3868 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3869 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3870 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3871 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3872 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3873 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3874 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3875 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3876 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3877 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3878 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3879 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3880 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3881 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3882 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3883 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3884 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3885 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3886 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3887 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3888 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3889 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3890 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3891 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3892 };
// BITREVERSE costs (scalar i8..i64 and 128/256/512-bit vectors) and byte-vector
// VROTLI costs, all annotated as being implemented with gf2p8affineqb.
// NOTE(review): presumably gated on a GFNI subtarget feature - the consuming
// lookup is outside this excerpt; confirm.
3893 static const CostKindTblEntry GFNICostTbl[] = {
3894 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
3895 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
3896 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
3897 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
3898 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3899 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3900 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3901 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
3902 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3903 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3904 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
3905 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3906 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3907 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
3908 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3909 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3910 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3911 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3912 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3913 };
// ISD::FSQRT cost rows for scalar and 128-bit f32/f64 types; the per-row
// notes name the non-VEX sqrtss/sqrtps/sqrtsd/sqrtpd instructions.
// NOTE(review): "GLM" presumably denotes the Goldmont CPU family, and the
// four cost values presumably map to the four TargetCostKind kinds described
// in the file header - confirm both against the code that consults this
// table, which is not visible here.
3914 static const CostKindTblEntry GLMCostTbl[] = {
3915 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3916 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3917 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3918 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3919 };
// Cost rows for ISD::BSWAP on 128-bit integer vectors and ISD::FSQRT on
// scalar and 128-bit f32/f64 types.
// NOTE(review): "SLM" presumably denotes Silvermont (the file header lists
// Silvermont next to SSE 4.2); the four cost values presumably map to the
// four TargetCostKind kinds described there - confirm against the lookup
// code, which is not visible here.
3920 static const CostKindTblEntry SLMCostTbl[] = {
3921 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3922 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3923 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3924 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3925 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3926 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3927 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3928 };
// Cost rows for v4i32 unsigned saturating add/sub, ISD::FMAXNUM and f32
// ISD::FSQRT. Per the file header, SSE 4.2 numbers are based on Nehalem.
// The USUBSAT/UADDSAT rows supply a single cost value only; the remaining
// cost kinds are left to whatever default the entry type provides
// (NOTE(review): default/fallback behaviour is not visible here - confirm).
// NOTE(review): the four-value rows presumably follow the four TargetCostKind
// kinds described in the file header - confirm the order against the
// CostKindTblEntry definition.
3929 static const CostKindTblEntry SSE42CostTbl[] = {
3930 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3931 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3932 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3933 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3934 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3935 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3936 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3937 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3938 };
// Cost rows for ISD::ABS on v2i64 and for integer min/max on 128-bit
// vectors. The signed rows (SMAX/SMIN) cover v2i64, v4i32 and v16i8; the
// unsigned rows (UMAX/UMIN) cover v2i64, v4i32 and v8i16. Every non-v2i64
// row is all ones, while the v2i64 rows carry larger values.
// NOTE(review): the four cost values presumably map to the four
// TargetCostKind kinds described in the file header - confirm the order
// against the CostKindTblEntry definition, which is not visible here.
3939 static const CostKindTblEntry SSE41CostTbl[] = {
3940 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3941 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3942 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3943 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3944 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3945 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3946 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3947 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3948 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3949 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3950 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3951 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3952 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3953 };
// Cost rows for integer operations on 128-bit vectors: ABS (no v2i64 row),
// BITREVERSE, BSWAP (no v16i8 row), CTLZ, CTPOP and CTTZ.
// NOTE(review): the four cost values presumably map to the four
// TargetCostKind kinds described in the file header - confirm the order
// against the CostKindTblEntry definition, which is not visible here.
3954 static const CostKindTblEntry SSSE3CostTbl[] = {
3955 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3956 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3957 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3958 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3959 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3960 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3961 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3962 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3963 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
3964 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
3965 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
3966 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
3967 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
3968 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
3969 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
3970 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
3971 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
3972 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
3973 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
3974 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
3975 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
3976 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
3977 };
// Cost rows for 128-bit integer vector operations (ABS, BITREVERSE, BSWAP,
// CTLZ, CTPOP, CTTZ, SMAX/SMIN/UMAX/UMIN, saturating add/sub on v8i16 and
// v16i8) plus f64/v2f64 FMAXNUM and FSQRT.
// The saturating add/sub rows supply a single cost value only; the remaining
// cost kinds are left to whatever default the entry type provides
// (NOTE(review): default/fallback behaviour is not visible here - confirm).
// NOTE(review): the four-value rows presumably follow the four TargetCostKind
// kinds described in the file header - confirm the order against the
// CostKindTblEntry definition.
3978 static const CostKindTblEntry SSE2CostTbl[] = {
3979 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
3980 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
3981 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
3982 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
3983 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
3984 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
3985 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
3986 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
3987 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
3988 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
3989 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
3990 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
3991 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
3992 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
3993 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
3994 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
3995 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
3996 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
3997 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
3998 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
3999 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4000 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4001 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4002 { ISD::SADDSAT, MVT::v8i16, { 1 } },
4003 { ISD::SADDSAT, MVT::v16i8, { 1 } },
// Signed min/max: the v8i16 row is all ones, the other widths carry larger
// values.
4004 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4005 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4006 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4007 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4008 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4009 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4010 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4011 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4012 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
4013 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
4014 { ISD::UADDSAT, MVT::v8i16, { 1 } },
4015 { ISD::UADDSAT, MVT::v16i8, { 1 } },
// Unsigned min/max: the v16i8 row is all ones, the other widths carry larger
// values.
4016 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4017 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4018 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4019 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4020 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4021 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4022 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4023 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4024 { ISD::USUBSAT, MVT::v8i16, { 1 } },
4025 { ISD::USUBSAT, MVT::v16i8, { 1 } },
4026 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4027 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4028 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4029 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4030 };
// Cost rows for single-precision ISD::FMAXNUM and ISD::FSQRT (f32 and
// v4f32). The FSQRT numbers are attributed to Pentium III by the per-row
// notes.
// NOTE(review): the four cost values presumably map to the four
// TargetCostKind kinds described in the file header - confirm the order
// against the CostKindTblEntry definition, which is not visible here.
4031 static const CostKindTblEntry SSE1CostTbl[] = {
4032 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4033 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4034 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4035 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4036 };
// Single-row table: ISD::CTTZ on i64 with one cost value of 1.
// NOTE(review): presumably consulted only when BMI is available on a 64-bit
// target, and the unspecified cost kinds presumably fall back to a default -
// neither the lookup code nor the entry type is visible here; confirm.
4037 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4038 { ISD::CTTZ, MVT::i64, { 1 } },
4039 };
// ISD::CTTZ on i32, i16 and i8, each with a single cost value of 1.
// NOTE(review): presumably consulted when BMI is available, and the
// unspecified cost kinds presumably fall back to a default - neither the
// lookup code nor the entry type is visible here; confirm.
4040 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4041 { ISD::CTTZ, MVT::i32, { 1 } },
4042 { ISD::CTTZ, MVT::i16, { 1 } },
4043 { ISD::CTTZ, MVT::i8, { 1 } },
4044 };
// Single-row table: ISD::CTLZ on i64 with one cost value of 1.
// NOTE(review): presumably consulted only when LZCNT is available on a
// 64-bit target, and the unspecified cost kinds presumably fall back to a
// default - neither the lookup code nor the entry type is visible here;
// confirm.
4045 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4046 { ISD::CTLZ, MVT::i64, { 1 } },
4047 };
// ISD::CTLZ on i32 (cost 1) and on the narrower i16/i8 types (cost 2).
// NOTE(review): the higher cost for i16/i8 presumably accounts for extra
// widening/adjustment work around the 32-bit instruction - the reason is not
// stated here; confirm. Unspecified cost kinds presumably fall back to a
// default defined by the entry type, which is not visible in this chunk.
4048 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4049 { ISD::CTLZ, MVT::i32, { 1 } },
4050 { ISD::CTLZ, MVT::i16, { 2 } },
4051 { ISD::CTLZ, MVT::i8, { 2 } },
4052 };
// Single-row table: ISD::CTPOP on i64, all four cost values 1 (the row note
// names the popcnt instruction).
// NOTE(review): the four cost values presumably map to the four
// TargetCostKind kinds described in the file header - confirm the order
// against the CostKindTblEntry definition, which is not visible here.
4053 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4054 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4055 };
// ISD::CTPOP on i32, i16 and i8. Per the row notes, i32 is a plain popcnt
// while i16/i8 are popcnt of a zero-extended value, which is reflected in
// the larger third and fourth cost values for those rows.
// NOTE(review): the four cost values presumably map to the four
// TargetCostKind kinds described in the file header - confirm the order
// against the CostKindTblEntry definition, which is not visible here.
4056 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4057 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4058 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4059 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4060 };
4061 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4062 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
4063 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4064 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4065 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4066 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
4067 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
4068 { I