LLVM 18.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// A note on the Cost Model numbers used below:
16/// the numbers correspond to some "generic" X86 CPU rather than to a
17/// specific CPU model. Usually the numbers correspond to the CPU where the
18/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost,
21/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target dependent costs (latency)
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
46/// the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48/// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
50
58#include "llvm/Support/Debug.h"
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71// Helper struct to store/access costs for each cost kind.
72// TODO: Move this to allow other targets to use it?
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
79 std::optional<unsigned>
81 unsigned Cost = ~0U;
82 switch (Kind) {
85 break;
88 break;
91 break;
94 break;
95 }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
102
105 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106 // TODO: Currently the __builtin_popcount() implementation using SSE3
107 // instructions is inefficient. Once the problem is fixed, we should
108 // call ST->hasSSE3() instead of ST->hasPOPCNT().
109 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110}
111
112std::optional<unsigned> X86TTIImpl::getCacheSize(
114 switch (Level) {
116 // - Penryn
117 // - Nehalem
118 // - Westmere
119 // - Sandy Bridge
120 // - Ivy Bridge
121 // - Haswell
122 // - Broadwell
123 // - Skylake
124 // - Kabylake
125 return 32 * 1024; // 32 KByte
127 // - Penryn
128 // - Nehalem
129 // - Westmere
130 // - Sandy Bridge
131 // - Ivy Bridge
132 // - Haswell
133 // - Broadwell
134 // - Skylake
135 // - Kabylake
136 return 256 * 1024; // 256 KByte
137 }
138
139 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140}
141
142std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
144 // - Penryn
145 // - Nehalem
146 // - Westmere
147 // - Sandy Bridge
148 // - Ivy Bridge
149 // - Haswell
150 // - Broadwell
151 // - Skylake
152 // - Kabylake
153 switch (Level) {
155 [[fallthrough]];
157 return 8;
158 }
159
160 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161}
162
163unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164 bool Vector = (ClassID == 1);
165 if (Vector && !ST->hasSSE1())
166 return 0;
167
168 if (ST->is64Bit()) {
169 if (Vector && ST->hasAVX512())
170 return 32;
171 return 16;
172 }
173 return 8;
174}
175
178 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179 switch (K) {
181 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
183 if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
184 return TypeSize::getFixed(512);
185 if (ST->hasAVX() && PreferVectorWidth >= 256)
186 return TypeSize::getFixed(256);
187 if (ST->hasSSE1() && PreferVectorWidth >= 128)
188 return TypeSize::getFixed(128);
189 return TypeSize::getFixed(0);
191 return TypeSize::getScalable(0);
192 }
193
194 llvm_unreachable("Unsupported register kind");
195}
196
199 .getFixedValue();
200}
201
203 // If the loop will not be vectorized, don't interleave the loop.
204 // Let the regular unroller unroll the loop instead, which saves the
205 // overflow check and memory check cost.
206 if (VF.isScalar())
207 return 1;
208
209 if (ST->isAtom())
210 return 1;
211
212 // Sandybridge and Haswell have multiple execution ports and pipelined
213 // vector units.
214 if (ST->hasAVX())
215 return 4;
216
217 return 2;
218}
219
224 const Instruction *CxtI) {
225
226 // vXi8 multiplications are always promoted to vXi16.
227 // Sub-128-bit types can be extended/packed more efficiently.
228 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
229 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
230 Type *WideVecTy =
231 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
232 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
234 CostKind) +
235 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
237 CostKind) +
238 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
239 }
240
241 // Legalize the type.
242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
243
244 int ISD = TLI->InstructionOpcodeToISD(Opcode);
245 assert(ISD && "Invalid opcode");
246
247 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
248 (LT.second.getScalarType() == MVT::i32 ||
249 LT.second.getScalarType() == MVT::i64)) {
250 // Check if the operands can be represented as a smaller datatype.
251 bool Op1Signed = false, Op2Signed = false;
252 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
253 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
254 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
255 bool SignedMode = Op1Signed || Op2Signed;
256
257 // If both vXi32 are representable as i15 and at least one is constant,
258 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
259 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
260 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
261 LT.second.getScalarType() == MVT::i32) {
262 bool Op1Constant =
263 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
264 bool Op2Constant =
265 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
266 bool Op1Sext = isa<SExtInst>(Args[0]) &&
267 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
268 bool Op2Sext = isa<SExtInst>(Args[1]) &&
269 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
270
271 bool IsZeroExtended = !Op1Signed || !Op2Signed;
272 bool IsConstant = Op1Constant || Op2Constant;
273 bool IsSext = Op1Sext || Op2Sext;
274 if (IsConstant || IsZeroExtended || IsSext)
275 LT.second =
276 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
277 }
278
279 // Check if the vXi32 operands can be shrunk into a smaller datatype.
280 // This should match the codegen from reduceVMULWidth.
281 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
282 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
283 if (OpMinSize <= 7)
284 return LT.first * 3; // pmullw/sext
285 if (!SignedMode && OpMinSize <= 8)
286 return LT.first * 3; // pmullw/zext
287 if (OpMinSize <= 15)
288 return LT.first * 5; // pmullw/pmulhw/pshuf
289 if (!SignedMode && OpMinSize <= 16)
290 return LT.first * 5; // pmullw/pmulhw/pshuf
291 }
292
293 // If both vXi64 are representable as (unsigned) i32, then we can perform
294 // the multiply with a single PMULUDQ instruction.
295 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
296 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
297 ISD = X86ISD::PMULUDQ;
298 }
299
300 // Vector multiply by pow2 will be simplified to shifts.
301 // Vector multiply by -pow2 will be simplified to shifts/negates.
302 if (ISD == ISD::MUL && Op2Info.isConstant() &&
303 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
305 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
306 Op1Info.getNoProps(), Op2Info.getNoProps());
307 if (Op2Info.isNegatedPowerOf2())
308 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
309 return Cost;
310 }
311
312 // On X86, vector signed division by a power-of-two constant is
313 // normally expanded to the sequence SRA + SRL + ADD + SRA.
314 // The OperandValue properties may not be the same as that of the previous
315 // operation; conservatively assume OP_None.
316 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
317 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
319 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
320 Op1Info.getNoProps(), Op2Info.getNoProps());
321 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
322 Op1Info.getNoProps(), Op2Info.getNoProps());
323 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
324 Op1Info.getNoProps(), Op2Info.getNoProps());
325
326 if (ISD == ISD::SREM) {
327 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
328 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
329 Op2Info.getNoProps());
330 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
331 Op2Info.getNoProps());
332 }
333
334 return Cost;
335 }
336
337 // Vector unsigned division/remainder will be simplified to shifts/masks.
338 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
339 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
340 if (ISD == ISD::UDIV)
341 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
342 Op1Info.getNoProps(), Op2Info.getNoProps());
343 // UREM
344 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
345 Op1Info.getNoProps(), Op2Info.getNoProps());
346 }
347
348 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
350 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
351 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
352 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
353 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
354 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
355 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
356 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
357 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
358
359 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
360 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
361 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
362 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
363 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
364 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
365 };
366
367 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
368 if (const auto *Entry =
369 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
370 if (auto KindCost = Entry->Cost[CostKind])
371 return LT.first * *KindCost;
372
373 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
374 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
375 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
376 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
377
378 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
379 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
380 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
381
382 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
383 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
384 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
385 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
386 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
387 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
388
389 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
390 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
391 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
392 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
393 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
394 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
395 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
396
397 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
398 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
399 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
400 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
401 };
402
403 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
404 if (const auto *Entry =
405 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
406 if (auto KindCost = Entry->Cost[CostKind])
407 return LT.first * *KindCost;
408
409 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
410 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
411 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
412 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
413 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
414 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
415 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
416
417 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
418 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
419 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
420 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
421 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
422 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
423
424 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
425 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
426 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
427 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
428 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
429 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
430
431 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
432 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
433 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
434 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
435 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
436 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
437
438 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
439 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
440 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
441 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
442 };
443
444 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
445 if (const auto *Entry =
446 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
447 if (auto KindCost = Entry->Cost[CostKind])
448 return LT.first * *KindCost;
449
450 static const CostKindTblEntry AVXUniformConstCostTable[] = {
451 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
452 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
453 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
454 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
455 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
456 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
457
458 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
459 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
460 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
461 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
462 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
463 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
464
465 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
466 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
467 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
468 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
469 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
470 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
471
472 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
473 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
474 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
475 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
476 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
477 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
478
479 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
480 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
481 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
482 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
483 };
484
485 // XOP has faster vXi8 shifts.
486 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
487 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
488 if (const auto *Entry =
489 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
490 if (auto KindCost = Entry->Cost[CostKind])
491 return LT.first * *KindCost;
492
493 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
494 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
495 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
496 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
497
498 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
499 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
500 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
501
502 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
503 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
504 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
505
506 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
507 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
508 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
509
510 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
511 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
512 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
513 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
514 };
515
516 // XOP has faster vXi8 shifts.
517 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
518 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
519 if (const auto *Entry =
520 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
521 if (auto KindCost = Entry->Cost[CostKind])
522 return LT.first * *KindCost;
523
524 static const CostKindTblEntry AVX512BWConstCostTable[] = {
525 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
526 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
527 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
528 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
529
530 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
531 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
532 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
533 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
534 };
535
536 if (Op2Info.isConstant() && ST->hasBWI())
537 if (const auto *Entry =
538 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
539 if (auto KindCost = Entry->Cost[CostKind])
540 return LT.first * *KindCost;
541
542 static const CostKindTblEntry AVX512ConstCostTable[] = {
543 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
544 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
545 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
546 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
547
548 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
549 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
550 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
551 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
552
553 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
554 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
555 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
556 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
557 };
558
559 if (Op2Info.isConstant() && ST->hasAVX512())
560 if (const auto *Entry =
561 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
562 if (auto KindCost = Entry->Cost[CostKind])
563 return LT.first * *KindCost;
564
565 static const CostKindTblEntry AVX2ConstCostTable[] = {
566 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
567 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
568 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
569 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
570
571 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
572 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
573 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
574 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
575
576 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
577 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
578 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
579 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
580 };
581
582 if (Op2Info.isConstant() && ST->hasAVX2())
583 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
584 if (auto KindCost = Entry->Cost[CostKind])
585 return LT.first * *KindCost;
586
587 static const CostKindTblEntry AVXConstCostTable[] = {
588 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
589 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
590 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
591 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
592
593 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
594 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
595 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
596 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
597
598 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
599 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
600 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
601 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
602 };
603
604 if (Op2Info.isConstant() && ST->hasAVX())
605 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
606 if (auto KindCost = Entry->Cost[CostKind])
607 return LT.first * *KindCost;
608
609 static const CostKindTblEntry SSE41ConstCostTable[] = {
610 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
611 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
612 };
613
614 if (Op2Info.isConstant() && ST->hasSSE41())
615 if (const auto *Entry =
616 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
617 if (auto KindCost = Entry->Cost[CostKind])
618 return LT.first * *KindCost;
619
620 static const CostKindTblEntry SSE2ConstCostTable[] = {
621 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
622 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
624 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
627 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
628 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
629 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
630
631 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
632 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
633 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
634 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
635 };
636
637 if (Op2Info.isConstant() && ST->hasSSE2())
638 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
639 if (auto KindCost = Entry->Cost[CostKind])
640 return LT.first * *KindCost;
641
642 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
643 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
644 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
645 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
646 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
647 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
648 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
649 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
650 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
651 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
652
653 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
654 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
655 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
656 };
657
658 if (ST->hasBWI() && Op2Info.isUniform())
659 if (const auto *Entry =
660 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
661 if (auto KindCost = Entry->Cost[CostKind])
662 return LT.first * *KindCost;
663
664 static const CostKindTblEntry AVX512UniformCostTable[] = {
665 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
666 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
667 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
668
669 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
670 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
671 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
672
673 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
674 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
675 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
676 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
677 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
678 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
679 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
680 };
681
682 if (ST->hasAVX512() && Op2Info.isUniform())
683 if (const auto *Entry =
684 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
685 if (auto KindCost = Entry->Cost[CostKind])
686 return LT.first * *KindCost;
687
688 static const CostKindTblEntry AVX2UniformCostTable[] = {
689 // Uniform splats are cheaper for the following instructions.
690 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
691 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
692 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
693 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
694 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
696
697 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
698 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
699 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
700 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
701 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
702 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
703
704 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
705 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
706 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
707 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
708 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
709 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
710
711 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
712 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
713 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
714 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
715 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
716 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
717 };
718
719 if (ST->hasAVX2() && Op2Info.isUniform())
720 if (const auto *Entry =
721 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
722 if (auto KindCost = Entry->Cost[CostKind])
723 return LT.first * *KindCost;
724
725 static const CostKindTblEntry AVXUniformCostTable[] = {
726 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
727 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
728 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
729 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
730 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
731 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
732
733 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
734 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
735 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
736 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
737 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
738 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
739
740 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
741 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
742 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
743 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
744 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
745 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
746
747 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
748 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
749 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
750 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
751 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
752 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
753 };
754
755 // XOP has faster vXi8 shifts.
756 if (ST->hasAVX() && Op2Info.isUniform() &&
757 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
758 if (const auto *Entry =
759 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
760 if (auto KindCost = Entry->Cost[CostKind])
761 return LT.first * *KindCost;
762
763 static const CostKindTblEntry SSE2UniformCostTable[] = {
764 // Uniform splats are cheaper for the following instructions.
765 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
766 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
767 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
768
769 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
770 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
771 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
772
773 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
774 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
775 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
776
777 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
778 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
779 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
780 };
781
782 if (ST->hasSSE2() && Op2Info.isUniform() &&
783 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
784 if (const auto *Entry =
785 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
786 if (auto KindCost = Entry->Cost[CostKind])
787 return LT.first * *KindCost;
788
789 static const CostKindTblEntry AVX512DQCostTable[] = {
790 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
791 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
792 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
793 };
794
795 // Look for AVX512DQ lowering tricks for custom cases.
796 if (ST->hasDQI())
797 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
798 if (auto KindCost = Entry->Cost[CostKind])
799 return LT.first * *KindCost;
800
801 static const CostKindTblEntry AVX512BWCostTable[] = {
802 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
803 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
804 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
805 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
806 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
807 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
808 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
809 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
810 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
811
812 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
813 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
814 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
815 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
816 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
817 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
818 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
819 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
820 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
821
822 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
823 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
824
825 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
826 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
827 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
828 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
829
830 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
831 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
832
833 { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
834 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
835
836 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
837 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
838 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
839 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
840 };
841
842 // Look for AVX512BW lowering tricks for custom cases.
843 if (ST->hasBWI())
844 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
845 if (auto KindCost = Entry->Cost[CostKind])
846 return LT.first * *KindCost;
847
848 static const CostKindTblEntry AVX512CostTable[] = {
849 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
850 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
851 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
852
853 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence.
854 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
855 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
856
857 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
858 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
859 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
860 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
861 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
862 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
863 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
864 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
865 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
866
867 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
868 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
869 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
870 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
871 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
872 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
873 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
874 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
875 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
876
877 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
878 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
879
880 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
881 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
882
883 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
884 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
885 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
886 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
887
888 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
889 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
890 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
891 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
892
893 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
894 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
895 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
896 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
897
898 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
899 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
900 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
901 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
902 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
903
904 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
905
906 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
907 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
908 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
909 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
910 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
911 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
912 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
913 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
914 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
915
916 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
917 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
918 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
919 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
920
921 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
922 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
923 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
924 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
925 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
926 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
927 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
928 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
929 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
930
931 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
932 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
933 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
934 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
935 };
936
937 if (ST->hasAVX512())
938 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
939 if (auto KindCost = Entry->Cost[CostKind])
940 return LT.first * *KindCost;
941
942 static const CostKindTblEntry AVX2ShiftCostTable[] = {
943 // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them as
944 // custom in order to detect the cases where the shift amount is a scalar.
945 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
946 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
947 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
948 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
949 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
950 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
951 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
952 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
953 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
954 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
955 };
956
957 if (ST->hasAVX512()) {
958 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
959 // On AVX512, a packed v32i16 shift left by a constant build_vector
960 // is lowered into a vector multiply (vpmullw).
961 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
962 Op1Info.getNoProps(), Op2Info.getNoProps());
963 }
964
965 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
966 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
967 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
968 Op2Info.isConstant())
969 // On AVX2, a packed v16i16 shift left by a constant build_vector
970 // is lowered into a vector multiply (vpmullw).
971 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
972 Op1Info.getNoProps(), Op2Info.getNoProps());
973
974 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
975 if (auto KindCost = Entry->Cost[CostKind])
976 return LT.first * *KindCost;
977 }
978
979 static const CostKindTblEntry XOPShiftCostTable[] = {
980 // 128bit shifts take 1cy, but right shifts require negation beforehand.
981 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
982 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
983 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
984 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
985 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
986 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
987 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
988 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
989 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
990 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
991 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
992 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
993 // 256bit shifts require splitting if AVX2 didn't catch them above.
994 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
995 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
996 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
997 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
998 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
999 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1000 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1001 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1002 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1003 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1004 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1005 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1006 };
1007
1008 // Look for XOP lowering tricks.
1009 if (ST->hasXOP()) {
1010 // If the right shift is constant then we'll fold the negation so
1011 // it's as cheap as a left shift.
1012 int ShiftISD = ISD;
1013 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1014 ShiftISD = ISD::SHL;
1015 if (const auto *Entry =
1016 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1017 if (auto KindCost = Entry->Cost[CostKind])
1018 return LT.first * *KindCost;
1019 }
1020
1021 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1022 MVT VT = LT.second;
1023 // Vector shift left by non uniform constant can be lowered
1024 // into vector multiply.
1025 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1026 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1027 ISD = ISD::MUL;
1028 }
1029
1030 static const CostKindTblEntry GLMCostTable[] = {
1031 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1032 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1033 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1034 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1035 };
1036
1037 if (ST->useGLMDivSqrtCosts())
1038 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1039 if (auto KindCost = Entry->Cost[CostKind])
1040 return LT.first * *KindCost;
1041
1042 static const CostKindTblEntry SLMCostTable[] = {
1043 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1044 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1045 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1046 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1047 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1048 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1049 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1050 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1051 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1052 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1053 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1054 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1055 // v2i64/v4i64 mul is custom lowered as a series of long
1056 // multiplies(3), shifts(3) and adds(2).
1057 // The SLM muldq throughput is 2 and the addq throughput is 4,
1058 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1059 // 2X4 (addq throughput) = 17
1060 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1061 // slm addq\subq throughput is 4
1062 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1063 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1064 };
1065
1066 if (ST->useSLMArithCosts())
1067 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1068 if (auto KindCost = Entry->Cost[CostKind])
1069 return LT.first * *KindCost;
1070
1071 static const CostKindTblEntry AVX2CostTable[] = {
1072 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1073 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1074 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsllvd/pack sequence.
1075 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsllvd/pack sequence.
1076
1077 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1078 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1079 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1080 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1081
1082 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1083 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1084 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1085 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1086 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1087 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1088
1089 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1090 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1091 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1092 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1093 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1094 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1095 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1096 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1097
1098 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1099 { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
1100 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1101 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1102 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1103 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1104 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1105
1106 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1107
1108 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1109 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1110
1111 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1112 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1113 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1114 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1115 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1116 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1117
1118 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1119 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1120 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1121 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1122 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1123 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1124
1125 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1126 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1127 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1128 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1129 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1130 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1131
1132 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1133 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1134 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1135 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1136 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1137 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1138 };
1139
1140 // Look for AVX2 lowering tricks for custom cases.
1141 if (ST->hasAVX2())
1142 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1143 if (auto KindCost = Entry->Cost[CostKind])
1144 return LT.first * *KindCost;
1145
1146 static const CostKindTblEntry AVX1CostTable[] = {
1147 // We don't have to scalarize unsupported ops. We can issue two half-sized
1148 // operations and we only need to extract the upper YMM half.
1149 // Two ops + 1 extract + 1 insert = 4.
1150 { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
1151 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1152 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1153 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1154 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1155
1156 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1157 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1158 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1159 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1160
1161 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1162 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1163 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1164 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1165
1166 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1167 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1168 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1169 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1170
1171 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1172 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1173 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1174 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1175 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1176 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1177 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1178 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1179 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1180 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1181
1182 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1183 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1184 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1185 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1186 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1187 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1188 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1189 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1190
1191 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1192 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1193 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1194 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1195 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1196 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1197 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1198 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1199
1200 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1201 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1202 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1203 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1204 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1205 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1206 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1207 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1208
1209 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1210 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1211
1212 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1213 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1214 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1215 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1216 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1217 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1218
1219 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1220 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1221 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1222 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1223 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1224 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1225
1226 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1227 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1228 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1229 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1230 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1231 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1232
1233 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1234 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1235 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1236 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1237 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1238 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1239 };
1240
1241 if (ST->hasAVX())
1242 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1243 if (auto KindCost = Entry->Cost[CostKind])
1244 return LT.first * *KindCost;
1245
1246 static const CostKindTblEntry SSE42CostTable[] = {
1247 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1248 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1249 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1250 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1251
1252 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1253 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1254 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1255 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1256
1257 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1258 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1259 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1260 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1261
1262 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1263 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1264 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1265 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1266
1267 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1268 };
1269
1270 if (ST->hasSSE42())
1271 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1272 if (auto KindCost = Entry->Cost[CostKind])
1273 return LT.first * *KindCost;
1274
1275 static const CostKindTblEntry SSE41CostTable[] = {
1276 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1277 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1278 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1279
1280 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1281 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1282 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1283 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1284
1285 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1286 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1287 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1288 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1289
1290 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1291 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1292 };
1293
1294 if (ST->hasSSE41())
1295 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1296 if (auto KindCost = Entry->Cost[CostKind])
1297 return LT.first * *KindCost;
1298
1299 static const CostKindTblEntry SSE2CostTable[] = {
1300 // We don't correctly identify costs of casts because they are marked as
1301 // custom.
1302 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1303 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1304 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1305 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1306
1307 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1308 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1309 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1310 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1311
1312 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1313 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1314 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1315 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1316
1317 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1318 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1319 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1320 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1321
1322 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1323 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1324 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1325 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1326
1327 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1328 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1329 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1330 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1331
1332 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1333 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1334
1335 { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1336 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1337 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1338 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1339
1340 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1341
1342 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1343 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1344 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1345 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1346
1347 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1348 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1349 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1350 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1351
1352 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1353 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1354 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1355
1356 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1357 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1358 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1359
1360 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1361 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362 };
1363
1364 if (ST->hasSSE2())
1365 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1366 if (auto KindCost = Entry->Cost[CostKind])
1367 return LT.first * *KindCost;
1368
1369 static const CostKindTblEntry SSE1CostTable[] = {
1370 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1371 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1372
1373 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1374 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1375
1376 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1377 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1378
1379 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1380 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1381
1382 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1383 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1384 };
1385
1386 if (ST->hasSSE1())
1387 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1388 if (auto KindCost = Entry->Cost[CostKind])
1389 return LT.first * *KindCost;
1390
1391 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1392 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1393 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1394 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1395 };
1396
1397 if (ST->is64Bit())
1398 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1399 if (auto KindCost = Entry->Cost[CostKind])
1400 return LT.first * *KindCost;
1401
1402 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1403 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1404 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1405 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1406
1407 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1408 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1409 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1410
1411 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1412 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1413 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1414
1415 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1417 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1418 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1419 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1420 };
1421
1422 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1423 if (auto KindCost = Entry->Cost[CostKind])
1424 return LT.first * *KindCost;
1425
1426 // It is not a good idea to vectorize division. We have to scalarize it and
1427 // in the process we will often end up having to spill regular
1428 // registers. The overhead of division is going to dominate most kernels
1429 // anyways so try hard to prevent vectorization of division - it is
1430 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1431 // to hide "20 cycles" for each lane.
1432 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1433 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1434 ISD == ISD::UREM)) {
1435 InstructionCost ScalarCost =
1437 Op1Info.getNoProps(), Op2Info.getNoProps());
1438 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1439 }
1440
1441 // Handle some basic single instruction code size cases.
1442 if (CostKind == TTI::TCK_CodeSize) {
1443 switch (ISD) {
1444 case ISD::FADD:
1445 case ISD::FSUB:
1446 case ISD::FMUL:
1447 case ISD::FDIV:
1448 case ISD::FNEG:
1449 case ISD::AND:
1450 case ISD::OR:
1451 case ISD::XOR:
1452 return LT.first;
1453 break;
1454 }
1455 }
1456
1457 // Fall back to the default implementation.
1458 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1459 Args, CxtI);
1460}
1461
1463 VectorType *BaseTp,
1464 ArrayRef<int> Mask,
1466 int Index, VectorType *SubTp,
1468 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1469 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1470 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1471
1472 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1473
1474 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1475 if (Kind == TTI::SK_Transpose)
1476 Kind = TTI::SK_PermuteTwoSrc;
1477
1478 // For Broadcasts we are splatting the first element from the first input
1479 // register, so only need to reference that input and all the output
1480 // registers are the same.
1481 if (Kind == TTI::SK_Broadcast)
1482 LT.first = 1;
1483
1484 // Treat <X x bfloat> shuffles as <X x half>.
1485 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1486 LT.second = LT.second.changeVectorElementType(MVT::f16);
1487
1488 // Subvector extractions are free if they start at the beginning of a
1489 // vector and cheap if the subvectors are aligned.
1490 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1491 int NumElts = LT.second.getVectorNumElements();
1492 if ((Index % NumElts) == 0)
1493 return 0;
1494 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1495 if (SubLT.second.isVector()) {
1496 int NumSubElts = SubLT.second.getVectorNumElements();
1497 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1498 return SubLT.first;
1499 // Handle some cases for widening legalization. For now we only handle
1500 // cases where the original subvector was naturally aligned and evenly
1501 // fit in its legalized subvector type.
1502 // FIXME: Remove some of the alignment restrictions.
1503 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1504 // vectors.
1505 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1506 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1507 (NumSubElts % OrigSubElts) == 0 &&
1508 LT.second.getVectorElementType() ==
1509 SubLT.second.getVectorElementType() &&
1510 LT.second.getVectorElementType().getSizeInBits() ==
1512 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1513 "Unexpected number of elements!");
1514 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1515 LT.second.getVectorNumElements());
1516 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1517 SubLT.second.getVectorNumElements());
1518 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1519 InstructionCost ExtractCost =
1520 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1521 CostKind, ExtractIndex, SubTy);
1522
1523 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1524 // if we have SSSE3 we can use pshufb.
1525 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1526 return ExtractCost + 1; // pshufd or pshufb
1527
1528 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1529 "Unexpected vector size");
1530
1531 return ExtractCost + 2; // worst case pshufhw + pshufd
1532 }
1533 }
1534 }
1535
1536 // Subvector insertions are cheap if the subvectors are aligned.
1537 // Note that in general, the insertion starting at the beginning of a vector
1538 // isn't free, because we need to preserve the rest of the wide vector.
1539 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1540 int NumElts = LT.second.getVectorNumElements();
1541 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1542 if (SubLT.second.isVector()) {
1543 int NumSubElts = SubLT.second.getVectorNumElements();
1544 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1545 return SubLT.first;
1546 }
1547
1548 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1549 Kind = TTI::SK_PermuteTwoSrc;
1550 }
1551
1552 // Handle some common (illegal) sub-vector types as they are often very cheap
1553 // to shuffle even on targets without PSHUFB.
1554 EVT VT = TLI->getValueType(DL, BaseTp);
1555 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1556 !ST->hasSSSE3()) {
1557 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1558 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1559 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1560 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1561 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1562 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1563
1564 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1565 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1566 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1567 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1568
1569 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1570 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1571 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1572 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1573
1574 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1575 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1576 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1577 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1578 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1579
1580 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1581 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1582 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1583 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1584 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1585 };
1586
1587 if (ST->hasSSE2())
1588 if (const auto *Entry =
1589 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1590 return Entry->Cost;
1591 }
1592
1593 // We are going to permute multiple sources and the result will be in multiple
1594 // destinations. Providing an accurate cost only for splits where the element
1595 // type remains the same.
1596 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1597 MVT LegalVT = LT.second;
1598 if (LegalVT.isVector() &&
1599 LegalVT.getVectorElementType().getSizeInBits() ==
1601 LegalVT.getVectorNumElements() <
1602 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1603 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1604 unsigned LegalVTSize = LegalVT.getStoreSize();
1605 // Number of source vectors after legalization:
1606 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1607 // Number of destination vectors after legalization:
1608 InstructionCost NumOfDests = LT.first;
1609
1610 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1611 LegalVT.getVectorNumElements());
1612
1613 if (!Mask.empty() && NumOfDests.isValid()) {
1614 // Try to perform better estimation of the permutation.
1615 // 1. Split the source/destination vectors into real registers.
1616 // 2. Do the mask analysis to identify which real registers are
1617 // permuted. If more than one source register is used for the
1618 // destination register building, the cost for this destination register
1619 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1620 // source register is used, build mask and calculate the cost as a cost
1621 // of PermuteSingleSrc.
1622 // Also, for the single register permute we try to identify if the
1623 // destination register is just a copy of the source register or the
1624 // copy of the previous destination register (the cost is
1625 // TTI::TCC_Basic). If the source register is just reused, the cost for
1626 // this operation is 0.
1627 NumOfDests =
1629 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1630 .first;
1631 unsigned E = *NumOfDests.getValue();
1632 unsigned NormalizedVF =
1633 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1634 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1635 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1636 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1637 copy(Mask, NormalizedMask.begin());
1638 unsigned PrevSrcReg = 0;
1639 ArrayRef<int> PrevRegMask;
1642 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1643 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1644 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1646 // Check if the previous register can be just copied to the next
1647 // one.
1648 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1649 PrevRegMask != RegMask)
1651 RegMask, CostKind, 0, nullptr);
1652 else
1653 // Just a copy of previous destination register.
1655 return;
1656 }
1657 if (SrcReg != DestReg &&
1658 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1659 // Just a copy of the source register.
1661 }
1662 PrevSrcReg = SrcReg;
1663 PrevRegMask = RegMask;
1664 },
1665 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1666 unsigned /*Unused*/,
1667 unsigned /*Unused*/) {
1669 CostKind, 0, nullptr);
1670 });
1671 return Cost;
1672 }
1673
1674 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1675 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1676 std::nullopt, CostKind, 0, nullptr);
1677 }
1678
1679 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1680 }
1681
1682 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1683 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1684 // We assume that source and destination have the same vector type.
1685 InstructionCost NumOfDests = LT.first;
1686 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1687 LT.first = NumOfDests * NumOfShufflesPerDest;
1688 }
1689
1690 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1691 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1692 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1693
1694 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1695 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1696
1697 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1698 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1699 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1700 };
1701
1702 if (ST->hasVBMI())
1703 if (const auto *Entry =
1704 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1705 return LT.first * Entry->Cost;
1706
1707 static const CostTblEntry AVX512BWShuffleTbl[] = {
1708 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1709 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1710 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1711
1712 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1713 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1714 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1715 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1716
1717 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1718 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1719 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1720 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1721 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1722
1723 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1724 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1725 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1726 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1727 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1728
1729 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1730 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1731
1732 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1733 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1734 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1735 };
1736
1737 if (ST->hasBWI())
1738 if (const auto *Entry =
1739 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1740 return LT.first * Entry->Cost;
1741
// Per-cost-kind shuffle costs keyed by (shuffle kind, legalized MVT). The
// lookup that follows this table consults it only when ST->hasAVX512() and
// scales the selected cost by LT.first. Trailing comments name the
// instruction(s) each entry is modelled on.
1742 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1743 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1744 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1745 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1746 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1747 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1748 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1749 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1750
1751 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1752 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1753 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1754 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1755 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1756 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1757 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1758
1759 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // valignq
1760 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // valignq
1761 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // valignd
1762 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // valignd
1763 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // valignq
1764 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // valignq
1765 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // valignd
1766 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // valignd
1767 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1768 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1769 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1770
1771 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1772 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1773 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1774 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1775 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1776 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1777 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1778 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1779 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1780 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1781 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1782 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1783 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1784
1785 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1786 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1787 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1788 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1789 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1790 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1791 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1792 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1793 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1794 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1795 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1796 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1797
1798 // FIXME: This just applies the type legalization cost rules above
1799 // assuming these completely split.
1800 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1801 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1802 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1803 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1804 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1805 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1806
1807 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1808 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1809 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1810 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1811 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1812 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1813 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1814 };
1815
1816 if (ST->hasAVX512())
1817 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1818 if (auto KindCost = Entry->Cost[CostKind])
1819 return LT.first * *KindCost;
1820
// Shuffle costs keyed by (shuffle kind, legalized MVT). The lookup that
// follows this table consults it only when ST->hasAVX2() and multiplies the
// entry's cost by LT.first. Trailing comments name the instruction sequence
// each cost is modelled on.
1821 static const CostTblEntry AVX2ShuffleTbl[] = {
1822 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastsd
1823 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastss
1824 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1825 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1826 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1827 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1828 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1829
1830 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1831 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1832 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1833 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1834 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1835 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1836 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1837
1838 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1839 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1840 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1841
1842 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1843 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1844 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1845 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1846 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1847
1848 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1849 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1850 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1851 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1852 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1853 // + vpblendvb
1854 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1855 // + vpblendvb
1856 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1857 // + vpblendvb
1858
1859 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1860 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1861 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1862 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1863 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1864 // + vpblendvb
1865 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1866 // + vpblendvb
1867 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1868 // + vpblendvb
1869 };
1870
1871 if (ST->hasAVX2())
1872 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1873 return LT.first * Entry->Cost;
1874
1875 static const CostTblEntry XOPShuffleTbl[] = {
1876 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1877 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1878 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1879 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1880 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1881 // + vinsertf128
1882 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1883 // + vinsertf128
1884
1885 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1886 // + vinsertf128
1887 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1888 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1889 // + vinsertf128
1890 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1891 };
1892
1893 if (ST->hasXOP())
1894 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1895 return LT.first * Entry->Cost;
1896
1897 static const CostTblEntry AVX1ShuffleTbl[] = {
1898 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1899 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1900 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1901 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1902 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1903 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1904 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1905
1906 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1907 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1908 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1909 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1910 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1911 // + vinsertf128
1912 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1913 // + vinsertf128
1914 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1915 // + vinsertf128
1916
1917 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1918 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1919 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1920 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1921 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1922 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1923 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1924
1925 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1926 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1927 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1928 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1929 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1930 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1931 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1932
1933 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1934 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1935 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1936 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1937 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1938 // + 2*por + vinsertf128
1939 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1940 // + 2*por + vinsertf128
1941 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1942 // + 2*por + vinsertf128
1943
1944 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1945 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1946 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1947 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1948 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1949 // + 4*por + vinsertf128
1950 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1951 // + 4*por + vinsertf128
1952 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1953 // + 4*por + vinsertf128
1954 };
1955
1956 if (ST->hasAVX())
1957 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1958 return LT.first * Entry->Cost;
1959
1960 static const CostTblEntry SSE41ShuffleTbl[] = {
1961 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1962 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1963 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1964 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1965 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1966 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1967 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1968 };
1969
1970 if (ST->hasSSE41())
1971 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1972 return LT.first * Entry->Cost;
1973
1974 static const CostTblEntry SSSE3ShuffleTbl[] = {
1975 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1976 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1977 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1978
1979 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1980 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
1981 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1982
1983 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1984 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
1985 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1986
1987 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
1988 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
1989 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
1990 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
1991 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
1992
1993 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1994 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
1995 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1996
1997 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1998 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
1999 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2000 };
2001
2002 if (ST->hasSSSE3())
2003 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2004 return LT.first * Entry->Cost;
2005
// Shuffle costs keyed by (shuffle kind, legalized MVT). The lookup further
// down consults this table inside the ST->hasSSE2() branch and multiplies the
// entry's cost by LT.first. Trailing comments name the instruction sequence
// each cost is modelled on.
2006 static const CostTblEntry SSE2ShuffleTbl[] = {
2007 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2008 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2009 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2010 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2011 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2012 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2013
2014 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2015 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2016 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2017 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2018 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2019 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2020 // + 2*pshufd + 2*unpck + packus
2021
2022 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2023 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2024 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2025 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2026 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2027 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2028
2029 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2030 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2031 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2032 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2033 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2034 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2035
2036 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2037 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2038 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2039 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2040 // + pshufd/unpck
2041 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2042 // + pshufd/unpck
2043 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2044 // + 2*pshufd + 2*unpck + 2*packus
2045
2046 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2047 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2048 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2049 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2050 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2051 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2052 };
2053
2054 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2055 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2056 };
2057
2058 if (ST->hasSSE2()) {
2059 bool IsLoad =
2060 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2061 if (ST->hasSSE3() && IsLoad)
2062 if (const auto *Entry =
2063 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2065 LT.second.getVectorElementCount()) &&
2066 "Table entry missing from isLegalBroadcastLoad()");
2067 return LT.first * Entry->Cost;
2068 }
2069
2070 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2071 return LT.first * Entry->Cost;
2072 }
2073
// Shuffle costs for v4f32, the only type listed at this feature level. Rows
// are {shuffle kind, MVT, cost}. Broadcast, reverse and single-source permute
// are costed as one shufps; select, splice and two-source permute as two.
// The lookup right after this table is guarded by ST->hasSSE1() and scales
// the matched cost by LT.first.
2074 static const CostTblEntry SSE1ShuffleTbl[] = {
2075 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2076 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2077 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2078 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2079 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2080 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2081 };
2082
2083 if (ST->hasSSE1())
2084 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2085 return LT.first * Entry->Cost;
2086
2087 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2088}
2089
2091 Type *Src,
2094 const Instruction *I) {
2095 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2096 assert(ISD && "Invalid opcode");
2097
2098 // TODO: Allow non-throughput costs that aren't binary.
// Normalises a table cost for the cost kind being queried. Reciprocal
// throughput queries receive the cost unchanged; every other cost kind is
// reduced to a binary answer (0 stays 0, anything else becomes 1), which is
// the limitation the TODO above refers to.
//
// The guard on listing line 2100 was missing from this listing; without it the
// binary reduction applied to every cost kind, `return Cost` was unreachable
// and the captured CostKind was never read.
2099 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2100 if (CostKind != TTI::TCK_RecipThroughput)
2101 return Cost == 0 ? 0 : 1;
2102 return Cost;
2103 };
2104
2105 // The cost tables include both specific, custom (non-legal) src/dst type
2106 // conversions and generic, legalized types. We test for customs first, before
2107 // falling back to legalization.
2108 // FIXME: Need a better design of the cost table to handle non-simple types of
2109 // potential massive combinations (elem_num x src_type x dst_type).
// Conversion costs for the AVX512BW feature level. Each row reads
// {ISD opcode, result MVT, operand MVT, cost}; e.g. the first row is a sign
// extend from v32i8 to v32i16 at cost 1.
// Rows whose two types have different element counts (e.g. v16i8 <- v2i1) are
// presumably the legalized/widened forms mentioned in the comment preceding
// this table -- confirm against the lookup code.
// NOTE(review): the lookup that consults this table is outside this excerpt;
// presumably it is gated on the subtarget having AVX512BW -- confirm.
2110 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2111 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2112 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2113
2114 // Mask sign extend has an instruction.
2115 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2116 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2117 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2118 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2119 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2120 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2121 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2122 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2123 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2124 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2125 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2126 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2127 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2128 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2129 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
2130 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
2131 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
2132
2133 // Mask zero extend is a sext + shift.
2134 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2135 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2136 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2137 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2138 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2139 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2140 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2141 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2142 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2143 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2144 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2145 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2146 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2147 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2148 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
2149 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
2150 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
2151
// Byte/word vector to mask (i1 element) truncates, all costed at 2.
2152 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2153 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2154 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2155 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2156 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2157 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2158 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2159 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2160 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2161 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2162 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2163 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2164 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2165 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2166 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
2167 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
2168 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
2169
// Word-to-byte element truncates.
2170 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
2171 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
2172 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
2173 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
2174 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
2175 };
2176
// Conversion costs for the AVX512DQ feature level. Each row reads
// {ISD opcode, result MVT, operand MVT, cost}. The table covers mask <->
// vector extends and truncates for 32/64-bit elements, plus 8-element
// i64 <-> f32/f64 conversions, which are all costed at 1.
// NOTE(review): the lookup that consults this table is outside this excerpt;
// presumably it is gated on the subtarget having AVX512DQ -- confirm.
2177 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2178 // Mask sign extend has an instruction.
2179 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2180 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2181 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2182 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2183 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2184 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
2185 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
2186 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
2187
2188 // Mask zero extend is a sext + shift.
2189 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2190 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2191 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2192 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2193 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2194 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
2195 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
2196 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
2197
// Dword/qword vector to mask (i1 element) truncates, all costed at 2.
2198 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2199 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2200 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2201 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2202 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2203 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
2204 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
2205 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
2206
// 64-bit integer <-> floating point, signed and unsigned, each costed at 1.
2207 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2208 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2209
2210 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2211 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2212
2213 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
2214 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
2215
2216 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
2217 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
2218 };
2219
2220 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2221 // 256-bit wide vectors.
2222
// Conversion costs for the AVX512F feature level. Each row reads
// {ISD opcode, result MVT, operand MVT, cost}. Trailing comments name the
// instruction sequence the cost was derived from.
// Rows whose note begins with "zmm" use narrower-than-512-bit types;
// presumably they are costed as if performed in a 512-bit register -- confirm.
// NOTE(review): the lookup that consults this table is outside this excerpt;
// presumably it is gated on the subtarget having AVX512F -- confirm.
2223 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2224 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
2225 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
2226 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
2227
// Vector to mask (i1 element) truncates, then integer narrowing truncates.
2228 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2229 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2230 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2231 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
2232 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2233 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2234 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2235 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2236 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
2237 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
2238 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
2239 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
2240 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
2241 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
2242 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
2243 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
2244 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
2245 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
2246 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
2247 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
2248 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
2249 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
2250 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
2251 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
2252 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
2253 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
2254 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
2255 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
2256 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
2257 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
2258 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
2259 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
2260 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
2261 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2262
2263 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
2264 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
2265 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
2266
2267 // Sign extend is zmm vpternlogd+vptruncdb.
2268 // Zero extend is zmm broadcast load+vptruncdw.
2269 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
2270 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
2271 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
2272 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
2273 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
2274 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
2275 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
2276 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
2277
2278 // Sign extend is zmm vpternlogd+vptruncdw.
2279 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2280 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
2281 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2282 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
2283 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2284 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
2285 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2286 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
2287 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2288
2289 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
2290 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
2291 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
2292 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
2293 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
2294 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
2295 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
2296 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
2297 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
2298 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
2299
2300 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
2301 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2302 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
2303 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
2304
// Integer widening to 512-bit results, each costed at 1.
2305 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2306 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2307 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2308 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2309 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2310 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2311 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2312 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2313 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2314 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2315
2316 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2317 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2318
2319 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2320 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2321 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2322 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2323 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2324 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2325 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2326 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2327
2328 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2329 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2330 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2331 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2332 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2333 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2334 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2335 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2336 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
2337 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
2338
2339 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2340 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
2341 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
2342 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
2343 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
2344 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
2345 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
2346 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
2347 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
2348 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
2349 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
2350
2351 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2352 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
2353 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
2354 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
2355 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
2356 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
2357 };
2358
// Conversion costs for the combined AVX512BW + AVX512VL feature level. Each
// row reads {ISD opcode, result MVT, operand MVT, cost}. Every byte/word
// vector type listed here is at most 256 bits wide (v32i8 / v16i16), including
// the rows paired with v32i1 and v64i1 masks.
// NOTE(review): the lookup that consults this table is outside this excerpt;
// presumably it requires both BW and VL on the subtarget -- confirm.
2359 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2360 // Mask sign extend has an instruction.
2361 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2362 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2363 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2364 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2365 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2366 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2367 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2368 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2369 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2370 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2371 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2372 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2373 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2374 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2375 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
2376 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
2377 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
2378
2379 // Mask zero extend is a sext + shift.
2380 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2381 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2382 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2383 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2384 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2385 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2386 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2387 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2388 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2389 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2390 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2391 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2392 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2393 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2394 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
2395 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
2396 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
2397
// Byte/word vector to mask (i1 element) truncates, all costed at 2.
2398 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2399 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2400 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2401 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2402 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2403 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2404 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2405 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2406 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2407 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2408 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2409 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2410 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2411 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2412 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
2413 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
2414 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
2415
2416 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
2417 };
2418
// Conversion costs for the combined AVX512DQ + AVX512VL feature level. Each
// row reads {ISD opcode, result MVT, operand MVT, cost}. The table covers mask
// <-> vector extends and truncates for 32/64-bit elements in vectors of at
// most 256 bits, plus 2- and 4-element i64 <-> fp conversions at cost 1.
// NOTE(review): the lookup that consults this table is outside this excerpt;
// presumably it requires both DQ and VL on the subtarget -- confirm.
2419 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
2420 // Mask sign extend has an instruction.
2421 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2422 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2423 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2424 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
2425 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2426 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
2427 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
2428 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2429
2430 // Mask zero extend is a sext + shift.
2431 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2432 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2433 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2434 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
2435 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2436 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
2437 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
2438 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2439
// Dword/qword vector to mask (i1 element) truncates, all costed at 2.
2440 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
2441 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
2442 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2443 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2444 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2445 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2446 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
2447 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2448
// 64-bit integer <-> floating point, signed and unsigned, each costed at 1.
2449 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2450 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2451 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2452 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2453
2454 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2455 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2456 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2457 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2458
2459 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
2460 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
2461 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2462 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
2463
2464 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
2465 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
2466 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2467 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
2468 };
2469
// Conversion costs for the AVX512VL feature level. Each row reads
// {ISD opcode, result MVT, operand MVT, cost}. Trailing comments name the
// instruction sequence the cost was derived from. Besides vector rows, the
// table also holds scalar rows (i64 <-> f32/f64, unsigned) near the end.
// NOTE(review): the lookup that consults this table is outside this excerpt;
// presumably it is gated on the subtarget having AVX512VL -- confirm.
2470 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2471 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2472 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2473 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2474 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2475 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2476 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2477 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2478 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2479 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2480 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2481 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2482 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2483 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2484 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2485 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2486 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2487 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2488 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovdb
2489
2490 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2491 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2492 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
2493 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
2494 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
2495 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
2496 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
2497 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
2498 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
2499 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
2500
2501 // sign extend is vpcmpeq+maskedmove+vpmovdw
2502 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2503 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2504 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
2505 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2506 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
2507 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2508 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
2509 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2510 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2511
2512 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2513 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2514 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2515 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2516 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2517 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2518 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
2519 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2520
2521 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2522 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2523 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2524 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2525
// Integer widening to 256-bit results, each costed at 1.
2526 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2527 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2528 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2529 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2530 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2531 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2532 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2533 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2534 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2535 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2536 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2537 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2538
2539 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2540 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2541 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2542 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2543
2544 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
2545 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
2546 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2547 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2548 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2549 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2550 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2551 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2552 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2553 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2554 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
2555 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2556 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
2557
2558 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2559 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2560 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
2561
2562 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
2563 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
2564 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2565 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
2566 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
2567 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
2568 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2569 };
2570
// Conversion costs for the AVX2 feature level. Each row reads
// {ISD opcode, result MVT, operand MVT, cost}. Some rows name types wider than
// 256 bits (e.g. v16i32, v8f64).
// NOTE(review): the lookup that consults this table is outside this excerpt;
// presumably it is gated on the subtarget having AVX2 -- confirm.
2571 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2572 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2573 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2574 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2575 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2576 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2577 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2578
// Integer widening from 128-bit operands.
2579 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2580 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2581 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2582 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2583 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2584 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2585 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2586 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2587 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2588 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2589 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2590 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2591 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2592 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2593
2594 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2595
2596 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
2597 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
2598 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
2599 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
2600 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
2601 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
2602 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
2603 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
2604 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
2605 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
2606 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
2607 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
2608
2609 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
2610 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
2611
2612 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
2613 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
2614 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
2615 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
2616
2617 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
2618 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
2619 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
2620 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2621 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2622 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
2623 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
2624 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
2625
2626 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2627 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2628 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2629 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2630 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2631 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2632 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
2633
2634 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2635 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2636 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2637 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2638 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
2639 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
2640 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
2641 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2642 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2643 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2644 };
2645
// Conversion costs for the AVX feature level. Each row reads
// {ISD opcode, result MVT, operand MVT, cost}. For the integer extends and
// truncates that also appear in the AVX2 table, the costs here are equal or
// higher.
// NOTE(review): the lookup that consults this table is outside this excerpt;
// presumably it is gated on the subtarget having AVX -- confirm.
2646 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2647 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
2648 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2649 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
2650 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2651 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2652 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2653
// Integer widening from 128-bit operands to 256-bit results, each costed at 3.
2654 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2655 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2656 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2657 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2658 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2659 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2660 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2661 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2662 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2663 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2664 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2665 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2666
// Truncates to i1-element vectors.
2667 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
2668 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
2669 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
2670 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
2671 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
2672
2673 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2674 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
2675 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2676 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
2677 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2678 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
2679 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2680 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
2681
2682 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
2683 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
2684 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
2685 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2686 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2687 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2688 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2689 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2690 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2691 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2692 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
2693 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
2694
2695 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
2696 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
2697 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
2698 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2699 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2700 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2701 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2702 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
2703 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
2704 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2705 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
2706 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
2707 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
2708 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
2709 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
2710 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2711 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
2712
2713 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2714 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
2715 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
2716 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
2717 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
2718 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
2719 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
2720 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
2721 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
2722 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
2723 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
2724
2725 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
2726 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
2727 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
2728 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
2729 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
2730 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
2731 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
2732 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
2733 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2734 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2735 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
2736 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
2737 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
2738
2739 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2740 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2741 };
2742
2743 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2744 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2745 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2746 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2747 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2748 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2749 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2750 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2751 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2752 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2753 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2754 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2755 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2756
2757 // These truncates end up widening elements.
2758 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2759 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2760 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2761
2762 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2763 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2764 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2765
2766 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2767 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2768 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2769 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2770 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2771 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2772 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2773 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2774 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2775 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2776 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2777
2778 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2779 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2780 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2781 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2782 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2783 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2784 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2785 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2786 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2787 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2788 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2789 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2790 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2791 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2792
2793 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2794 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2795 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2796 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2797 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2798 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2799 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2800 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2801 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2802 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2803
2804 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2805 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2806 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2807 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2808 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2809 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2810 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2811 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2812 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2813 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2814 };
2815
2816 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2817 // These are somewhat magic numbers justified by comparing the
2818 // output of llvm-mca for our various supported scheduler models
2819 // and basing it off the worst case scenario.
2820 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
2821 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
2822 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
2823 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
2824 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2825 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2826 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2827 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2828 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2829 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2830 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2831 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2832
2833 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
2834 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
2835 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2836 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2837 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2838 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2839 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2840 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2841 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
2842 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2843 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2844 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2845 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2846
2847 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
2848 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
2849 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
2850 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
2851 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
2852 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
2853 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
2854 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
2855 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
2856 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
2857
2858 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
2859 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2860 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
2861 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2862 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
2863 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
2864 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
2865 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
2866 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2867 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
2868
2869 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2870 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2871 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
2872 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
2873 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2874 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
2875 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
2876 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
2877 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2878 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
2879 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2880 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
2881
2882 // These truncates are really widening elements.
2883 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2884 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2885 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2886 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2887 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2888 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2889
2890 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2891 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2892 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2893 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2894 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2895 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
2896 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2897 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 },
2898 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2899 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2900 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2901 };
2902
2903 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2904 EVT SrcTy = TLI->getValueType(DL, Src);
2905 EVT DstTy = TLI->getValueType(DL, Dst);
2906
2907 // The function getSimpleVT only handles simple value types.
2908 if (SrcTy.isSimple() && DstTy.isSimple()) {
2909 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2910 MVT SimpleDstTy = DstTy.getSimpleVT();
2911
2912 if (ST->useAVX512Regs()) {
2913 if (ST->hasBWI())
2914 if (const auto *Entry = ConvertCostTableLookup(
2915 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2916 return AdjustCost(Entry->Cost);
2917
2918 if (ST->hasDQI())
2919 if (const auto *Entry = ConvertCostTableLookup(
2920 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2921 return AdjustCost(Entry->Cost);
2922
2923 if (ST->hasAVX512())
2924 if (const auto *Entry = ConvertCostTableLookup(
2925 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2926 return AdjustCost(Entry->Cost);
2927 }
2928
2929 if (ST->hasBWI())
2930 if (const auto *Entry = ConvertCostTableLookup(
2931 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2932 return AdjustCost(Entry->Cost);
2933
2934 if (ST->hasDQI())
2935 if (const auto *Entry = ConvertCostTableLookup(
2936 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2937 return AdjustCost(Entry->Cost);
2938
2939 if (ST->hasAVX512())
2940 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2941 SimpleDstTy, SimpleSrcTy))
2942 return AdjustCost(Entry->Cost);
2943
2944 if (ST->hasAVX2()) {
2945 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2946 SimpleDstTy, SimpleSrcTy))
2947 return AdjustCost(Entry->Cost);
2948 }
2949
2950 if (ST->hasAVX()) {
2951 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2952 SimpleDstTy, SimpleSrcTy))
2953 return AdjustCost(Entry->Cost);
2954 }
2955
2956 if (ST->hasSSE41()) {
2957 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2958 SimpleDstTy, SimpleSrcTy))
2959 return AdjustCost(Entry->Cost);
2960 }
2961
2962 if (ST->hasSSE2()) {
2963 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2964 SimpleDstTy, SimpleSrcTy))
2965 return AdjustCost(Entry->Cost);
2966 }
2967 }
2968
2969 // Fall back to legalized types.
2970 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2971 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2972
2973 // If we're truncating to the same legalized type - just assume it's free.
2974 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2975 return TTI::TCC_Free;
2976
2977 if (ST->useAVX512Regs()) {
2978 if (ST->hasBWI())
2979 if (const auto *Entry = ConvertCostTableLookup(
2980 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2981 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2982
2983 if (ST->hasDQI())
2984 if (const auto *Entry = ConvertCostTableLookup(
2985 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2986 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2987
2988 if (ST->hasAVX512())
2989 if (const auto *Entry = ConvertCostTableLookup(
2990 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2991 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2992 }
2993
2994 if (ST->hasBWI())
2995 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2996 LTDest.second, LTSrc.second))
2997 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2998
2999 if (ST->hasDQI())
3000 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3001 LTDest.second, LTSrc.second))
3002 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3003
3004 if (ST->hasAVX512())
3005 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3006 LTDest.second, LTSrc.second))
3007 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3008
3009 if (ST->hasAVX2())
3010 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3011 LTDest.second, LTSrc.second))
3012 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3013
3014 if (ST->hasAVX())
3015 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3016 LTDest.second, LTSrc.second))
3017 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3018
3019 if (ST->hasSSE41())
3020 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3021 LTDest.second, LTSrc.second))
3022 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3023
3024 if (ST->hasSSE2())
3025 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3026 LTDest.second, LTSrc.second))
3027 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3028
3029 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3030 // sitofp.
3031 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3032 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3033 Type *ExtSrc = Src->getWithNewBitWidth(32);
3034 unsigned ExtOpc =
3035 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3036
3037 // For scalar loads the extend would be free.
3038 InstructionCost ExtCost = 0;
3039 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3040 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3041
3042 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3044 }
3045
3046 // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3047 // i32.
3048 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3049 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3050 Type *TruncDst = Dst->getWithNewBitWidth(32);
3051 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3052 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3054 }
3055
3056 return AdjustCost(
3057 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3058}
3059
3061 Type *CondTy,
3062 CmpInst::Predicate VecPred,
3064 const Instruction *I) {
3065 // Early out if this type isn't scalar/vector integer/float.
3066 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3067 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3068 I);
3069
3070 // Legalize the type.
3071 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3072
3073 MVT MTy = LT.second;
3074
3075 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3076 assert(ISD && "Invalid opcode");
3077
3078 InstructionCost ExtraCost = 0;
3079 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3080 // Some vector comparison predicates cost extra instructions.
3081 // TODO: Should we invert this and assume worst case cmp costs
3082 // and reduce for particular predicates?
3083 if (MTy.isVector() &&
3084 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3085 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3086 ST->hasBWI())) {
3087 // Fallback to I if a specific predicate wasn't specified.
3088 CmpInst::Predicate Pred = VecPred;
3089 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3091 Pred = cast<CmpInst>(I)->getPredicate();
3092
3093 switch (Pred) {
3095 // xor(cmpeq(x,y),-1)
3096 ExtraCost = 1;
3097 break;
3100 // xor(cmpgt(x,y),-1)
3101 ExtraCost = 1;
3102 break;
3105 // cmpgt(xor(x,signbit),xor(y,signbit))
3106 // xor(cmpeq(pmaxu(x,y),x),-1)
3107 ExtraCost = 2;
3108 break;
3111 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3112 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3113 // cmpeq(psubus(x,y),0)
3114 // cmpeq(pminu(x,y),x)
3115 ExtraCost = 1;
3116 } else {
3117 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3118 ExtraCost = 3;
3119 }
3120 break;
3123 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3124 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3125 if (CondTy && !ST->hasAVX())
3126 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3128 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3130 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3131
3132 break;
3135 // Assume worst case scenario and add the maximum extra cost.
3136 ExtraCost = 3;
3137 break;
3138 default:
3139 break;
3140 }
3141 }
3142 }
3143
3144 static const CostKindTblEntry SLMCostTbl[] = {
3145 // slm pcmpeq/pcmpgt throughput is 2
3146 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3147 // slm pblendvb/blendvpd/blendvps throughput is 4
3148 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3149 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3150 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3151 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3152 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3153 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3154 };
3155
3156 static const CostKindTblEntry AVX512BWCostTbl[] = {
3157 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3158 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3159 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3160 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3161
3162 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3163 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3164 };
3165
3166 static const CostKindTblEntry AVX512CostTbl[] = {
3167 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3168 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3169 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3170 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3171
3172 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3173 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3174 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3175 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3176 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3177 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3178 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3179
3180 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3181 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3182 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3183 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3184 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3185 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3186 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3187 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3188 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3189 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3190 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3191 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3192 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3193 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3194
3195 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3196 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3197 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3198 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3199 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3200 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3201 };
3202
3203 static const CostKindTblEntry AVX2CostTbl[] = {
3204 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3205 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3206 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3207 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3208 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3209 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3210
3211 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3212 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3213 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3214 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3215
3216 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3217 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3218 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3219 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3220 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3221 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3222 };
3223
3224 static const CostKindTblEntry XOPCostTbl[] = {
3225 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3226 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3227 };
3228
3229 static const CostKindTblEntry AVX1CostTbl[] = {
3230 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3231 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3232 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3233 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3234 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3235 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3236
3237 // AVX1 does not support 8-wide integer compare.
3238 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3239 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3240 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3241 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3242
3243 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3244 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3245 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3246 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3247 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3248 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3249 };
3250
3251 static const CostKindTblEntry SSE42CostTbl[] = {
3252 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3253 };
3254
3255 static const CostKindTblEntry SSE41CostTbl[] = {
3256 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3257 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3258
3259 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3260 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3261 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3262 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3263 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3264 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3265 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3266 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3267 };
3268
3269 static const CostKindTblEntry SSE2CostTbl[] = {
3270 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3271 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3272
3273 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3274 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3275 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3276 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3277
3278 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3279 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3280 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3281 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3282 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3283 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3284 };
3285
3286 static const CostKindTblEntry SSE1CostTbl[] = {
3287 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3288 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3289
3290 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3291 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3292 };
3293
3294 if (ST->useSLMArithCosts())
3295 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3296 if (auto KindCost = Entry->Cost[CostKind])
3297 return LT.first * (ExtraCost + *KindCost);
3298
3299 if (ST->hasBWI())
3300 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3301 if (auto KindCost = Entry->Cost[CostKind])
3302 return LT.first * (ExtraCost + *KindCost);
3303
3304 if (ST->hasAVX512())
3305 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3306 if (auto KindCost = Entry->Cost[CostKind])
3307 return LT.first * (ExtraCost + *KindCost);
3308
3309 if (ST->hasAVX2())
3310 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3311 if (auto KindCost = Entry->Cost[CostKind])
3312 return LT.first * (ExtraCost + *KindCost);
3313
3314 if (ST->hasXOP())
3315 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3316 if (auto KindCost = Entry->Cost[CostKind])
3317 return LT.first * (ExtraCost + *KindCost);
3318
3319 if (ST->hasAVX())
3320 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3321 if (auto KindCost = Entry->Cost[CostKind])
3322 return LT.first * (ExtraCost + *KindCost);
3323
3324 if (ST->hasSSE42())
3325 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3326 if (auto KindCost = Entry->Cost[CostKind])
3327 return LT.first * (ExtraCost + *KindCost);
3328
3329 if (ST->hasSSE41())
3330 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3331 if (auto KindCost = Entry->Cost[CostKind])
3332 return LT.first * (ExtraCost + *KindCost);
3333
3334 if (ST->hasSSE2())
3335 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3336 if (auto KindCost = Entry->Cost[CostKind])
3337 return LT.first * (ExtraCost + *KindCost);
3338
3339 if (ST->hasSSE1())
3340 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3341 if (auto KindCost = Entry->Cost[CostKind])
3342 return LT.first * (ExtraCost + *KindCost);
3343
3344 // Assume a 3cy latency for fp select ops.
3345 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3346 if (ValTy->getScalarType()->isFloatingPointTy())
3347 return 3;
3348
3349 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3350}
3351
3353
3357 // Costs should match the codegen from:
3358 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3359 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3360 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3361 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3362 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3363
3364 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3365 // specialized in these tables yet.
3366 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3367 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3368 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3369 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3370 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3371 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3372 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3373 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3374 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3375 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3376 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3377 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3378 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3379 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3380 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3381 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3382 };
3383 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3384 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3385 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3386 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3387 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3388 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3389 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3390 };
3391 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3392 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3393 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3394 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3395 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3396 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3397 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3398 };
3399 static const CostKindTblEntry AVX512CDCostTbl[] = {
3400 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3401 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3402 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3403 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3404 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3405 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3406 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3407 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3408 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3409 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3410 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3411 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3412
3413 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3414 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3415 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3416 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3417 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3418 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3419 };
3420 static const CostKindTblEntry AVX512BWCostTbl[] = {
3421 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3422 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3423 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3424 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3425 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3426 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3427 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3428 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3429 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3430 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3431 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3432 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3433 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3434 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3435 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3436 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3437 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3438 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3439 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3440 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3441 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3442 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3443 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3444 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3445 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3446 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3447 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3448 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3449 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3450 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3451 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3452 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3453 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3454 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3455 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3456 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3457 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3458 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3459 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3460 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3461 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3462 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3463 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3464 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3465 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3466 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3467 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3468 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3469 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3470 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3471 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3472 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3473 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3474 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3475 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3476 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3477 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3478 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3479 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3480 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3481 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3482 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3483 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3484 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3485 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3486 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3487 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3488 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3489 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3490 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3491 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3492 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3493 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3494 };
// AVX512CostTbl: per-(ISD opcode, MVT) cost rows. The file header lists
// Xeon Phi / Skylake as the reference CPUs for AVX-512 numbers.
// A row holds either four costs (one per TargetCostKind) or a single cost.
// NOTE(review): the column order is presumably {RecipThroughput, Latency,
// CodeSize, SizeAndLatency}, and the code that consults this table is outside
// this view - confirm both against CostKindTblEntry and the lookup site.
3495 static const CostKindTblEntry AVX512CostTbl[] = {
3496 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3497 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3498 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3499 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3500 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3501 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3502 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3503 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3504 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3505 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3506 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3507 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3508 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3509 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3510 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3511 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3512 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3513 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3514 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3515 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3516 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3517 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3518 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3519 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3520 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3521 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3522 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3523 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3524 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3525 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3526 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3527 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3528 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3529 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3530 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3531 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3532 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3533 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3534 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3535 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3536 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3537 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3538 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3539 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3540 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3541 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3542 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3543 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3544 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3545 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3546 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3547 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3548 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3549 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3550 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3551 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3552 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3553 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3554 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3555 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3556 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3557 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3558 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3559 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3560 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3561 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3562 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3563 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3564 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3565 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3566 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3567 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3568 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3569 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3570 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3571 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3572 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3573 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3574 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3575 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3576 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3577 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3578 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3579 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3580 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3581 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3582 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3583 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3584 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3585 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3586 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3587 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3588 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3589 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3590 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3591 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3592 };
// XOPCostTbl: BITREVERSE / ROTL / ROTR cost rows for scalar, 128-bit and
// 256-bit integer types. Each row lists four costs, one per TargetCostKind.
// In every group the 256-bit rows carry higher costs than the 128-bit rows.
// NOTE(review): presumably only consulted when the subtarget has XOP; the
// lookup site is outside this view - confirm.
3593 static const CostKindTblEntry XOPCostTbl[] = {
3594 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3595 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3596 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3597 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3598 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3599 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3600 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3601 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3602 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3603 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3604 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3605 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3606 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
// The ROTR rows below differ from the matching ROTL rows only in their last
// two values, which are higher - presumably the extra SUB; confirm.
3607 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3608 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3609 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3610 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3611 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3612 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3613 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3614 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3615 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3616 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3617 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3618 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3619 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3620 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3621 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3622 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } }
3623 };
// AVX2CostTbl: per-(ISD opcode, MVT) cost rows for 128-bit and 256-bit vector
// types plus scalar f32/f64. The file header lists Haswell / Ryzen as the
// reference CPUs for AVX2 numbers.
// Rows with four values give one cost per TargetCostKind; the saturating
// add/sub rows give a single value only.
// NOTE(review): the column order and the handling of one-value rows are
// defined by CostKindTblEntry, which is not visible here - confirm.
3624 static const CostKindTblEntry AVX2CostTbl[] = {
3625 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3626 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3627 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3628 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3629 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3630 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3631 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3632 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3633 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3634 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3635 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3636 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3637 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3638 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3639 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3640 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3641 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3642 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3643 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3644 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3645 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3646 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3647 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3648 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3649 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3650 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3651 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3652 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3653 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3654 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3655 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3656 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3657 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3658 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3659 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3660 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3661 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3662 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3663 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3664 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3665 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3666 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3667 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3668 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3669 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3670 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3671 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3672 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3673 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3674 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3675 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3676 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3677 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3678 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3679 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3680 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3681 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3682 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3683 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3684 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3685 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3686 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3687 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3688 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3689 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3690 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3691 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3692 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3693 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3694 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3695 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3696 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3697 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3698 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3699 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3700 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3701 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3702 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3703 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3704 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3705 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3706 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3707 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3708 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3709 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3710 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3711 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3712 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3713 };
// AVX1CostTbl: per-(ISD opcode, MVT) cost rows. The file header lists
// Sandy Bridge / Jaguar / Bulldozer as the reference CPUs for AVX numbers.
// Most 256-bit integer rows are annotated "2 x 128-bit Op + extract/insert"
// and carry roughly double the cost of the matching 128-bit row.
// Rows with four values give one cost per TargetCostKind; the saturating
// add/sub rows give a single value only.
// NOTE(review): column order is defined by CostKindTblEntry (not visible
// here) - confirm before editing individual numbers.
3714 static const CostKindTblEntry AVX1CostTbl[] = {
3715 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3716 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3717 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3718 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3719 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3720 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3721 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3722 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3723 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3724 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3725 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3726 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3727 { ISD::BSWAP, MVT::v4i64, { 5, 7, 5, 10 } },
3728 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 3 } },
3729 { ISD::BSWAP, MVT::v8i32, { 5, 7, 5, 10 } },
3730 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 3 } },
3731 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3732 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3733 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3734 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3735 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3736 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3737 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3738 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3739 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3740 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3741 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3742 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3743 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3744 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3745 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3746 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3747 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3748 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3749 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3750 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3751 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3752 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3753 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3754 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3755 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3756 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3757 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3758 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3759 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3760 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3761 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3762 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3763 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3764 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3765 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3766 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3767 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3768 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3769 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3770 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3771 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3772 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3773 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3774 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3775 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3776 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3777 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3778 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3779 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3780 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3781 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3782 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3783 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3784 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3785 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3786 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3787 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3788 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3789 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3790 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3791 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3792 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3793 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3794 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3795 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3796 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3797 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3798 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3799 };
// GLMCostTbl: FSQRT-only cost rows for f32/f64 scalars and 128-bit vectors.
// The vector rows cost roughly twice the scalar rows in the first two values.
// NOTE(review): "GLM" presumably denotes the Goldmont CPU family and the table
// is presumably selected by a matching subtarget check - confirm at the lookup
// site, which is outside this view.
3800 static const CostKindTblEntry GLMCostTbl[] = {
3801 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3802 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3803 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3804 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3805 };
// SLMCostTbl: FSQRT-only cost rows for f32/f64 scalars and 128-bit vectors.
// The vector rows cost twice the scalar rows in the first value.
// NOTE(review): "SLM" presumably denotes Silvermont (the file header pairs
// Silvermont with SSE 4.2) - confirm at the lookup site, outside this view.
3806 static const CostKindTblEntry SLMCostTbl[] = {
3807 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3808 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3809 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3810 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3811 };
// SSE42CostTbl: cost rows for v4i32 unsigned saturating add/sub, FMAXNUM and
// single-precision FSQRT. The file header lists Nehalem / Silvermont as the
// reference CPUs for SSE 4.2 numbers; the FSQRT rows cite Nehalem.
// The two saturating rows give a single cost; the rest give four, one per
// TargetCostKind.
3812 static const CostKindTblEntry SSE42CostTbl[] = {
3813 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3814 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3815 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3816 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3817 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3818 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3819 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3820 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3821 };
// SSE41CostTbl: cost rows for v2i64 ABS and for integer min/max on 128-bit
// vectors. The file header lists Penryn as the reference CPU for SSE 4.1.
// Signed min/max has rows for v2i64, v4i32 and v16i8; unsigned min/max has
// rows for v2i64, v4i32 and v8i16. The v2i64 rows are the only ones above 1.
// NOTE(review): the element types missing here (signed v8i16, unsigned v16i8)
// appear with cost 1 in SSE2CostTbl, so they are presumably resolved there -
// confirm the table fallback order at the lookup site.
3822 static const CostKindTblEntry SSE41CostTbl[] = {
3823 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3824 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3825 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3826 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3827 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3828 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3829 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3830 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3831 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3832 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3833 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3834 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3835 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3836 };
// SSSE3CostTbl: cost rows for 128-bit integer ABS, BITREVERSE, BSWAP, CTLZ,
// CTPOP and CTTZ. Every row lists four costs, one per TargetCostKind.
// There is no v16i8 BSWAP row, and no v2i64 ABS row.
// NOTE(review): presumably gated on the subtarget having SSSE3; the lookup
// site is outside this view - confirm.
3837 static const CostKindTblEntry SSSE3CostTbl[] = {
3838 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3839 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3840 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3841 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3842 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3843 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3844 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3845 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3846 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3847 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3848 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
3849 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
3850 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
3851 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
3852 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
3853 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
3854 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
3855 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
3856 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
3857 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
3858 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
3859 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
3860 };
// SSE2CostTbl: cost rows for 128-bit integer vectors and double-precision
// FMAXNUM / FSQRT. The FSQRT rows cite Nehalem as their source.
// Rows with four values give one cost per TargetCostKind; the saturating
// add/sub rows give a single value only.
// Signed min/max is cheapest (all 1s) for v8i16 and unsigned min/max for
// v16i8; the other element widths carry higher costs.
// NOTE(review): column order is defined by CostKindTblEntry (not visible
// here) - confirm before editing individual numbers.
3861 static const CostKindTblEntry SSE2CostTbl[] = {
3862 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
3863 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
3864 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
3865 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
3866 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
3867 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
3868 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
3869 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
3870 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
3871 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
3872 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
3873 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
3874 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
3875 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
3876 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
3877 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
3878 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
3879 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
3880 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
3881 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
3882 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
3883 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
3884 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
3885 { ISD::SADDSAT, MVT::v8i16, { 1 } },
3886 { ISD::SADDSAT, MVT::v16i8, { 1 } },
3887 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3888 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
3889 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3890 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
3891 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3892 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
3893 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3894 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
3895 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
3896 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
3897 { ISD::UADDSAT, MVT::v8i16, { 1 } },
3898 { ISD::UADDSAT, MVT::v16i8, { 1 } },
3899 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3900 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
3901 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
3902 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3903 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3904 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
3905 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
3906 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3907 { ISD::USUBSAT, MVT::v8i16, { 1 } },
3908 { ISD::USUBSAT, MVT::v16i8, { 1 } },
3909 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
3910 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
3911 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3912 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3913 };
// SSE1CostTbl: single-precision FMAXNUM and FSQRT cost rows (f32 and v4f32).
// The FSQRT numbers cite Pentium III as their source; the v4f32 FSQRT row is
// twice the scalar row in the first value.
3914 static const CostKindTblEntry SSE1CostTbl[] = {
3915 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
3916 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
3917 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
3918 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
3919 };
// BMI64CostTbl: single-cost row for i64 CTTZ, marked as 64-bit targets only.
// NOTE(review): the cost of 1 presumably models a single TZCNT instruction,
// and only one cost value is given - confirm how the remaining cost kinds are
// derived from a one-value row.
3920 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
3921 { ISD::CTTZ, MVT::i64, { 1 } },
3922 };
// BMI32CostTbl: single-cost rows for i32/i16/i8 CTTZ, usable on 32-bit and
// 64-bit targets. All three widths share the same cost of 1.
// NOTE(review): presumably models a single TZCNT instruction - confirm.
3923 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3924 { ISD::CTTZ, MVT::i32, { 1 } },
3925 { ISD::CTTZ, MVT::i16, { 1 } },
3926 { ISD::CTTZ, MVT::i8, { 1 } },
3927 };
// LZCNT64CostTbl: single-cost row for i64 CTLZ, marked as 64-bit targets only.
// NOTE(review): the cost of 1 presumably models a single LZCNT instruction -
// confirm.
3928 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3929 { ISD::CTLZ, MVT::i64, { 1 } },
3930 };
// LZCNT32CostTbl: single-cost rows for i32/i16/i8 CTLZ, usable on 32-bit and
// 64-bit targets. The i16 and i8 rows cost 2 versus 1 for i32.
// NOTE(review): the extra unit for i16/i8 is presumably the widening needed
// around a 32-bit LZCNT - confirm against the lowering.
3931 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3932 { ISD::CTLZ, MVT::i32, { 1 } },
3933 { ISD::CTLZ, MVT::i16, { 2 } },
3934 { ISD::CTLZ, MVT::i8, { 2 } },
3935 };
// POPCNT64CostTbl: i64 CTPOP costs one unit in every cost kind (a single
// popcnt), on 64-bit targets only.
3936 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
3937 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
3938 };
// POPCNT32CostTbl: i32/i16/i8 CTPOP costs, usable on 32-bit and 64-bit
// targets. The i16/i8 rows are noted as popcnt of a zero-extended value and
// carry a higher cost in the last two values than the i32 row.
3939 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
3940 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
3941 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
3942 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
3943 };
// X64CostTbl: baseline i64 scalar cost rows, marked as 64-bit targets only.
// Rows with four values give one cost per TargetCostKind; CTLZ, CTTZ and the
// overflow ops (SADDO/UADDO/UMULO) give a single value only.
// X86ISD::VROTLI is the opcode the intrinsic switch further down assigns to
// fshl/fshr calls whose first two operands match and whose amount is a
// ConstantInt.
3944 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
3945 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
3946 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
3947 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
3948 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
3949 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
3950 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
3951 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF
3952 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
3953 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
3954 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
3955 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
3956 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
3957 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
3958 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
3959 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
3960 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
3961 { ISD::SADDO, MVT::i64, { 1 } },
3962 { ISD::UADDO, MVT::i64, { 1 } },
3963 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
3964 };
// X86CostTbl: baseline i32/i16/i8 scalar cost rows, usable on 32-bit and
// 64-bit targets. The i64 counterparts live in X64CostTbl.
// Rows with four values give one cost per TargetCostKind; CTLZ, CTTZ and the
// overflow ops (SADDO/UADDO/UMULO) give a single value only.
// There is no i8 BSWAP row.
// X86ISD::VROTLI is the opcode the intrinsic switch further down assigns to
// fshl/fshr calls whose first two operands match and whose amount is a
// ConstantInt.
3965 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3966 { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
3967 { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
3968 { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
3969 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
3970 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
3971 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
3972 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
3973 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
3974 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
3975 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
3976 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
3977 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
3978 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
3979 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
3980 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
3981 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
3982 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
3983 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
3984 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
3985 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
3986 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
3987 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
3988 { ISD::CTPOP, MVT::i8, { 7, 6, 13, 13 } },
3989 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
3990 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
3991 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
3992 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
3993 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
3994 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
3995 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
3996 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
3997 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
3998 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
3999 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4000 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4001 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4002 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4003 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4004 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4005 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4006 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4007 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4008 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4009 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4010 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4011 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4012 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4013 { ISD::SADDO, MVT::i32, { 1 } },
4014 { ISD::SADDO, MVT::i16, { 1 } },
4015 { ISD::SADDO, MVT::i8, { 1 } },
4016 { ISD::UADDO, MVT::i32, { 1 } },
4017 { ISD::UADDO, MVT::i16, { 1 } },
4018 { ISD::UADDO, MVT::i8, { 1 } },
4019 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4020 { ISD::UMULO, MVT::i16, { 2 } },
4021 { ISD::UMULO, MVT::i8, { 2 } },
4022 };
4023
4024 Type *RetTy = ICA.getReturnType();
4025 Type *OpTy = RetTy;
4026 Intrinsic::ID IID = ICA.getID();
4027 unsigned ISD = ISD::DELETED_NODE;
4028 switch (IID) {
4029 default:
4030 break;
4031 case Intrinsic::abs:
4032 ISD = ISD::ABS;
4033 break;
4034 case Intrinsic::bitreverse:
4035 ISD = ISD::BITREVERSE;
4036 break;
4037 case Intrinsic::bswap:
4038 ISD = ISD::BSWAP;
4039 break;
4040 case Intrinsic::ctlz:
4041 ISD = ISD::CTLZ;
4042 break;
4043 case Intrinsic::ctpop:
4044 ISD = ISD::CTPOP;
4045 break;
4046 case Intrinsic::cttz:
4047 ISD = ISD::CTTZ;
4048 break;
4049 case Intrinsic::fshl:
4050 ISD = ISD::FSHL;
4051 if (!ICA.isTypeBasedOnly()) {
4052 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4053 if (Args[0] == Args[1]) {
4054 ISD = ISD::ROTL;
4055 // Handle scalar constant rotation amounts.
4056 // TODO: Handle vector + funnel-shift cases.
4057 if (isa_and_nonnull<ConstantInt>(Args[2]))
4058 ISD = X86ISD::VROTLI;
4059 }
4060 }
4061 break;
4062 case Intrinsic::fshr:
4063 // FSHR has same costs so don't duplicate.
4064 ISD = ISD::FSHL;
4065 if (!ICA.isTypeBasedOnly()) {
4066 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4067 if (Args[0] == Args[1]) {
4068 // Handle scalar constant rotation amount.
4069 // TODO: Handle vector + funnel-shift cases.
4070 ISD = ISD::ROTR;
4071 if (isa_and_nonnull<ConstantInt>(Args[2]))
4072 ISD = X86ISD::VROTLI;
4073 }
4074 }
4075 break;
4076 case Intrinsic::maxnum:
4077 case Intrinsic::minnum:
4078 // FMINNUM has same costs so don't duplicate.
4079 ISD = ISD::FMAXNUM;
4080 break;
4081 case Intrinsic::sadd_sat:
4082 ISD = ISD::SADDSAT;
4083 break;
4084 case Intrinsic::smax:
4085 ISD = ISD::SMAX;
4086 break;
4087 case Intrinsic::smin:
4088 ISD = ISD::SMIN;
4089 break;
4090 case Intrinsic::ssub_sat:
4091 ISD = ISD::SSUBSAT;
4092 break;
4093 case Intrinsic::uadd_sat:
4094 ISD = ISD::UADDSAT;
4095 break;
4096 case Intrinsic::umax:
4097 ISD = ISD::UMAX;
4098 break;
4099 case Intrinsic::umin:
4100 ISD = ISD::UMIN;
4101 break;
4102 case Intrinsic::usub_sat:
4103 ISD = ISD::USUBSAT;
4104 break;
4105 case Intrinsic::sqrt:
4106 ISD = ISD::FSQRT;
4107 break;
4108 case Intrinsic::sadd_with_overflow:
4109 case Intrinsic::ssub_with_overflow:
4110 // SSUBO has same costs so don't duplicate.
4111 ISD = ISD::SADDO;
4112 OpTy = RetTy->getContainedType(0);
4113 break;
4114 case Intrinsic::uadd_with_overflow:
4115 case Intrinsic::usub_with_overflow:
4116 // USUBO has same costs so don't duplicate.
4117 ISD = ISD::UADDO;
4118 OpTy = RetTy->getContainedType(0);
4119 break;
4120 case Intrinsic::umul_with_overflow:
4121 case Intrinsic::smul_with_overflow:
4122 // SMULO has same costs so don't duplicate.
4123 ISD = ISD::UMULO;
4124 OpTy = RetTy->getContainedType(0);
4125 break;