LLVM 23.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About Cost Model numbers used below it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU instead of usage of a
17/// specific CPU model. Usually the numbers correspond to the CPU where the
18/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost,
21/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target dependent costs (latency)
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
46/// the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48/// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
50
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71// Helper struct to store/access costs for each cost kind.
72// TODO: Move this to allow other targets to use it?
// One cost slot per cost kind. Every slot defaults to ~0U, which the lookup
// below treats as "no cost recorded".
// NOTE(review): the struct header line is absent from this listing.
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
// Lookup keyed on Kind: the switch picks a value into Cost, and the result is
// std::nullopt when that value is still the ~0U sentinel.
// NOTE(review): the declarator line and the case labels are absent from this
// listing; presumably each case copies the matching field above - confirm
// against the upstream file.
79 std::optional<unsigned>
81 unsigned Cost = ~0U;
82 switch (Kind) {
85 break;
88 break;
91 break;
94 break;
95 }
// A value that was never filled in is reported as "no cost" rather than as a
// huge number.
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
103
/// Reports the population-count support level: TTI::PSK_FastHardware when the
/// subtarget has POPCNT, TTI::PSK_Software otherwise.
/// \p TyWidth is only checked by the assertion below (it must be a power of
/// two); it does not influence the result.
/// NOTE(review): the return-type line is absent from this listing.
105X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
106 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
107 // TODO: Currently the __builtin_popcount() implementation using SSE3
108 // instructions is inefficient. Once the problem is fixed, we should
109 // call ST->hasSSE3() instead of ST->hasPOPCNT().
110 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
111}
112
/// Returns a fixed cache size in bytes selected by the switch on Level:
/// 32 KiB for the first handled level and 256 KiB for the second. Any other
/// value reaches llvm_unreachable.
/// The sizes are constants; the subtarget is not queried here.
/// NOTE(review): the parameter line and both case labels are absent from this
/// listing; presumably the two cases are the L1 and L2 data caches - confirm
/// against the upstream file.
113std::optional<unsigned> X86TTIImpl::getCacheSize(
115 switch (Level) {
// Microarchitectures this first size is documented for:
117 // - Penryn
118 // - Nehalem
119 // - Westmere
120 // - Sandy Bridge
121 // - Ivy Bridge
122 // - Haswell
123 // - Broadwell
124 // - Skylake
125 // - Kabylake
126 return 32 * 1024; // 32 KiB
// Microarchitectures this second size is documented for:
128 // - Penryn
129 // - Nehalem
130 // - Westmere
131 // - Sandy Bridge
132 // - Ivy Bridge
133 // - Haswell
134 // - Broadwell
135 // - Skylake
136 // - Kabylake
137 return 256 * 1024; // 256 KiB
138 }
139
// Only reached when Level matches none of the cases above.
140 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
141}
142
/// Returns the cache associativity for Level. Both handled levels share one
/// answer: the first case falls through into the second, which returns 8.
/// Any other value reaches llvm_unreachable.
/// NOTE(review): the parameter line and both case labels are absent from this
/// listing.
143std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
// Microarchitectures the value below is documented for:
145 // - Penryn
146 // - Nehalem
147 // - Westmere
148 // - Sandy Bridge
149 // - Ivy Bridge
150 // - Haswell
151 // - Broadwell
152 // - Skylake
153 // - Kabylake
154 switch (Level) {
156 [[fallthrough]];
158 return 8;
159 }
160
// Only reached when Level matches none of the cases above.
161 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
162}
163
165
// Selects a register class id in priority order:
//   1. VectorClass when Vector is set;
//   2. ScalarFPClass when Ty is non-null and a floating-point type;
//   3. GPRClass otherwise (including a null Ty).
// NOTE(review): the signature line is absent from this listing.
167 return Vector ? VectorClass
168 : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
169 : GPRClass;
170}
171
172unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
173 if (ClassID == VectorClass && !ST->hasSSE1())
174 return 0;
175
176 if (!ST->is64Bit())
177 return 8;
178
179 if ((ClassID == GPRClass && ST->hasEGPR()) ||
180 (ClassID != GPRClass && ST->hasAVX512()))
181 return 32;
182
183 return 16;
184}
185
// Decides whether a conditional (faulting) load/store is available for Ty.
// Returns false without the CF subtarget feature, true when no type is given,
// and otherwise true only for 16/32/64-bit integer element widths.
// NOTE(review): the signature line is absent from this listing.
187 if (!ST->hasCF())
188 return false;
// A null type is answered with true once the CF feature check has passed.
189 if (!Ty)
190 return true;
191 // Conditional faulting is supported by CFCMOV, which only accepts
192 // 16/32/64-bit operands.
193 // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
194 // profitable.
195 auto *VTy = dyn_cast<FixedVectorType>(Ty);
// Reject anything that is neither an integer scalar nor a fixed vector with
// exactly one element.
196 if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
197 return false;
// NOTE(review): a one-element fixed vector whose element is not an integer
// passes the guard above and reaches the cast<IntegerType> below - confirm
// callers never pass such a type.
198 auto *ScalarTy = Ty->getScalarType();
199 switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
200 default:
201 return false;
202 case 16:
203 case 32:
204 case 64:
205 return true;
206 }
207}
208
// Returns the register width in bits for the register kind K.
// NOTE(review): the signature and the three case labels are absent from this
// listing; presumably the cases are scalar, fixed-width vector and scalable
// vector, in that order - confirm against the upstream file.
211 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
212 switch (K) {
// First case: 64 bits on a 64-bit subtarget, 32 bits otherwise.
214 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
// Second case: the widest of 512/256/128 for which the subtarget has the
// matching feature AND the preferred vector width is at least that wide;
// 0 when none qualifies.
216 if (ST->hasAVX512() && PreferVectorWidth >= 512)
217 return TypeSize::getFixed(512);
218 if (ST->hasAVX() && PreferVectorWidth >= 256)
219 return TypeSize::getFixed(256);
220 if (ST->hasSSE1() && PreferVectorWidth >= 128)
221 return TypeSize::getFixed(128);
222 return TypeSize::getFixed(0);
// Third case: always a scalable size of 0.
224 return TypeSize::getScalable(0);
225 }
226
// Only reached when K matches none of the cases above.
227 llvm_unreachable("Unsupported register kind");
228}
229
234
// Interleave-factor heuristic keyed on VF and the subtarget:
//   1 when VF is scalar, 1 on Atom, 4 with AVX, 2 otherwise.
// NOTE(review): the signature line is absent from this listing.
236 // If the loop will not be vectorized, don't interleave the loop.
237 // Let regular unroll to unroll the loop, which saves the overflow
238 // check and memory check cost.
239 if (VF.isScalar())
240 return 1;
241
// Atom subtargets never interleave, even for vector VFs.
242 if (ST->isAtom())
243 return 1;
244
245 // Sandybridge and Haswell have multiple execution ports and pipelined
246 // vector units.
247 if (ST->hasAVX())
248 return 4;
249
// Default for the remaining (pre-AVX, non-Atom) subtargets.
250 return 2;
251}
252
254 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
256 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
257
258 // vXi8 multiplications are always promoted to vXi16.
259 // Sub-128-bit types can be extended/packed more efficiently.
260 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
261 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
262 Type *WideVecTy =
264 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
266 CostKind) +
267 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
269 CostKind) +
270 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
271 }
272
273 // Legalize the type.
274 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
275
276 int ISD = TLI->InstructionOpcodeToISD(Opcode);
277 assert(ISD && "Invalid opcode");
278
279 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
280 (LT.second.getScalarType() == MVT::i32 ||
281 LT.second.getScalarType() == MVT::i64)) {
282 // Check if the operands can be represented as a smaller datatype.
283 bool Op1Signed = false, Op2Signed = false;
284 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
285 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
286 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
287 bool SignedMode = Op1Signed || Op2Signed;
288
289 // If both vXi32 are representable as i15 and at least one is constant,
290 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
291 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
292 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
293 LT.second.getScalarType() == MVT::i32) {
294 bool Op1Constant =
295 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
296 bool Op2Constant =
297 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
298 bool Op1Sext = isa<SExtInst>(Args[0]) &&
299 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
300 bool Op2Sext = isa<SExtInst>(Args[1]) &&
301 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
302
303 bool IsZeroExtended = !Op1Signed || !Op2Signed;
304 bool IsConstant = Op1Constant || Op2Constant;
305 bool IsSext = Op1Sext || Op2Sext;
306 if (IsConstant || IsZeroExtended || IsSext)
307 LT.second =
308 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
309 }
310
311 // Check if the vXi32 operands can be shrunk into a smaller datatype.
312 // This should match the codegen from reduceVMULWidth.
313 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
314 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
315 if (OpMinSize <= 7)
316 return LT.first * 3; // pmullw/sext
317 if (!SignedMode && OpMinSize <= 8)
318 return LT.first * 3; // pmullw/zext
319 if (OpMinSize <= 15)
320 return LT.first * 5; // pmullw/pmulhw/pshuf
321 if (!SignedMode && OpMinSize <= 16)
322 return LT.first * 5; // pmullw/pmulhw/pshuf
323 }
324
325 // If both vXi64 are representable as (unsigned) i32, then we can perform
326 // the multiply with a single PMULUDQ instruction.
327 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
328 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
329 ISD = X86ISD::PMULUDQ;
330 }
331
332 // Vector multiply by pow2 will be simplified to shifts.
333 // Vector multiply by -pow2 will be simplified to shifts/negates.
334 if (ISD == ISD::MUL && Op2Info.isConstant() &&
335 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
337 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
338 Op1Info.getNoProps(), Op2Info.getNoProps());
339 if (Op2Info.isNegatedPowerOf2())
340 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
341 return Cost;
342 }
343
344 // On X86, vector signed division by a power-of-two constant is
345 // normally expanded to the sequence SRA + SRL + ADD + SRA.
346 // The OperandValue properties may not be the same as that of the previous
347 // operation; conservatively assume OP_None.
348 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
349 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
351 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
352 Op1Info.getNoProps(), Op2Info.getNoProps());
353 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
354 Op1Info.getNoProps(), Op2Info.getNoProps());
355 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
356 Op1Info.getNoProps(), Op2Info.getNoProps());
357
358 if (ISD == ISD::SREM) {
359 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
360 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
361 Op2Info.getNoProps());
362 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
363 Op2Info.getNoProps());
364 }
365
366 return Cost;
367 }
368
369 // Vector unsigned division/remainder will be simplified to shifts/masks.
370 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
371 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
372 if (ISD == ISD::UDIV)
373 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
374 Op1Info.getNoProps(), Op2Info.getNoProps());
375 // UREM
376 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
377 Op1Info.getNoProps(), Op2Info.getNoProps());
378 }
379
380 static const CostKindTblEntry GFNIUniformConstCostTable[] = {
381 { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
382 { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
383 { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
384 { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
385 { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
386 { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
387 { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
388 { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
389 { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
390 };
391
392 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
393 if (const auto *Entry =
394 CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
395 if (auto KindCost = Entry->Cost[CostKind])
396 return LT.first * *KindCost;
397
398 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
399 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
400 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
401 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
402 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
403 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
404 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
405 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
406 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
407 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
408
409 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
410 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
411 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
412 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
413 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
414 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
415 };
416
417 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
418 if (const auto *Entry =
419 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
420 if (auto KindCost = Entry->Cost[CostKind])
421 return LT.first * *KindCost;
422
423 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
424 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
425 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
426 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
427
428 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
429 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
430 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
431
432 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
433 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
434 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
435 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
436 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
437 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
438
439 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
440 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
441 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
442 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
443 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
444 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
445 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
446
447 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
448 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
449 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
450 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
451 };
452
453 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
454 if (const auto *Entry =
455 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
456 if (auto KindCost = Entry->Cost[CostKind])
457 return LT.first * *KindCost;
458
459 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
460 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
461 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
462 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
463 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
464 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
465 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
466
467 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
468 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
469 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
470 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
471 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
472 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
473
474 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
475 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
476 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
477 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
478 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
479 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
480
481 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
482 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
483 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
484 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
485 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
486 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
487
488 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
489 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
490 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
491 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
492 };
493
494 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
495 if (const auto *Entry =
496 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
497 if (auto KindCost = Entry->Cost[CostKind])
498 return LT.first * *KindCost;
499
500 static const CostKindTblEntry AVXUniformConstCostTable[] = {
501 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
502 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
503 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
504 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
505 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
506 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
507
508 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
509 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
510 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
511 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
512 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
513 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
514
515 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
516 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
517 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
518 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
519 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
520 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
521
522 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
523 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
524 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
525 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
526 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
527 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
528
529 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
530 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
531 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
532 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
533 };
534
535 // XOP has faster vXi8 shifts.
536 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
537 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
538 if (const auto *Entry =
539 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
540 if (auto KindCost = Entry->Cost[CostKind])
541 return LT.first * *KindCost;
542
543 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
544 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
545 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
546 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
547
548 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
549 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
550 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
551
552 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
553 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
554 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
555
556 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
557 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
558 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
559
560 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
561 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
562 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
563 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
564 };
565
566 // XOP has faster vXi8 shifts.
567 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
568 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
569 if (const auto *Entry =
570 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
571 if (auto KindCost = Entry->Cost[CostKind])
572 return LT.first * *KindCost;
573
574 static const CostKindTblEntry AVX512BWConstCostTable[] = {
575 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
576 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
577 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
578 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
579
580 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
581 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
582 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
583 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
584 };
585
586 if (Op2Info.isConstant() && ST->hasBWI())
587 if (const auto *Entry =
588 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
589 if (auto KindCost = Entry->Cost[CostKind])
590 return LT.first * *KindCost;
591
592 static const CostKindTblEntry AVX512ConstCostTable[] = {
593 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
594 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
595 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
596 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
597
598 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
599 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
600 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
601 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
602
603 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
604 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
605 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
606 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
607 };
608
609 if (Op2Info.isConstant() && ST->hasAVX512())
610 if (const auto *Entry =
611 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
612 if (auto KindCost = Entry->Cost[CostKind])
613 return LT.first * *KindCost;
614
615 static const CostKindTblEntry AVX2ConstCostTable[] = {
616 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
617 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
618 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
619 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
620
621 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
622 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
624 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
627 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
628 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
629 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
630 };
631
632 if (Op2Info.isConstant() && ST->hasAVX2())
633 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
634 if (auto KindCost = Entry->Cost[CostKind])
635 return LT.first * *KindCost;
636
637 static const CostKindTblEntry AVXConstCostTable[] = {
638 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
639 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
640 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
641 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
642
643 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
644 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
645 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
646 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
647
648 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
649 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
650 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
651 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
652 };
653
654 if (Op2Info.isConstant() && ST->hasAVX())
655 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
656 if (auto KindCost = Entry->Cost[CostKind])
657 return LT.first * *KindCost;
658
659 static const CostKindTblEntry SSE41ConstCostTable[] = {
660 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
661 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
662 };
663
664 if (Op2Info.isConstant() && ST->hasSSE41())
665 if (const auto *Entry =
666 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
667 if (auto KindCost = Entry->Cost[CostKind])
668 return LT.first * *KindCost;
669
670 static const CostKindTblEntry SSE2ConstCostTable[] = {
671 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
672 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
673 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
674 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
675
676 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
677 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
678 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
679 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
680
681 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
682 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
683 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
684 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
685 };
686
687 if (Op2Info.isConstant() && ST->hasSSE2())
688 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
689 if (auto KindCost = Entry->Cost[CostKind])
690 return LT.first * *KindCost;
691
692 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
693 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
694 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
696 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
697 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
698 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
699 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
700 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
701 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
702
703 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
704 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
705 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
706 };
707
708 if (ST->hasBWI() && Op2Info.isUniform())
709 if (const auto *Entry =
710 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
711 if (auto KindCost = Entry->Cost[CostKind])
712 return LT.first * *KindCost;
713
714 static const CostKindTblEntry AVX512UniformCostTable[] = {
715 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
716 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
717 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
718
719 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
720 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
721 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
722
723 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
724 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
725 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
726 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
727 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
728 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
729 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
730 };
731
732 if (ST->hasAVX512() && Op2Info.isUniform())
733 if (const auto *Entry =
734 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
735 if (auto KindCost = Entry->Cost[CostKind])
736 return LT.first * *KindCost;
737
738 static const CostKindTblEntry AVX2UniformCostTable[] = {
739 // Uniform splats are cheaper for the following instructions.
740 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
741 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
742 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
743 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
744 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
745 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
746
747 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
748 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
749 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
750 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
751 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
752 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
753
754 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
755 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
756 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
757 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
758 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
759 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
760
761 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
762 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
763 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
764 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
765 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
766 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
767 };
768
769 if (ST->hasAVX2() && Op2Info.isUniform())
770 if (const auto *Entry =
771 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
772 if (auto KindCost = Entry->Cost[CostKind])
773 return LT.first * *KindCost;
774
775 static const CostKindTblEntry AVXUniformCostTable[] = {
776 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
777 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
778 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
779 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
780 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
781 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
782
783 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
784 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
785 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
786 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
787 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
788 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
789
790 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
791 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
792 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
793 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
794 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
795 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
796
797 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
798 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
799 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
800 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
801 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
802 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
803 };
804
805 // XOP has faster vXi8 shifts.
806 if (ST->hasAVX() && Op2Info.isUniform() &&
807 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
808 if (const auto *Entry =
809 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
810 if (auto KindCost = Entry->Cost[CostKind])
811 return LT.first * *KindCost;
812
813 static const CostKindTblEntry SSE2UniformCostTable[] = {
814 // Uniform splats are cheaper for the following instructions.
815 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
816 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
817 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
818
819 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
820 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
821 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
822
823 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
824 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
825 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
826
827 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
828 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
829 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
830 };
831
832 if (ST->hasSSE2() && Op2Info.isUniform() &&
833 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
834 if (const auto *Entry =
835 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
836 if (auto KindCost = Entry->Cost[CostKind])
837 return LT.first * *KindCost;
838
839 static const CostKindTblEntry AVX512DQCostTable[] = {
840 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
841 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
842 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
843 };
844
845 // Look for AVX512DQ lowering tricks for custom cases.
846 if (ST->hasDQI())
847 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
848 if (auto KindCost = Entry->Cost[CostKind])
849 return LT.first * *KindCost;
850
851 static const CostKindTblEntry AVX512BWCostTable[] = {
852 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
853 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
854 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
855 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
856 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
857 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
858 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
859 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
860 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
861
862 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
863 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
864 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
865 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
866 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
867 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
868 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
869 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
870 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
871
872 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
873 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
874
875 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
876 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
877 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
878 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
879
880 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
881 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
882
883 { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
884 { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
885 { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
886 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
887
888 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
889 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
890 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
891 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
892 };
893
894 // Look for AVX512BW lowering tricks for custom cases.
895 if (ST->hasBWI())
896 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
897 if (auto KindCost = Entry->Cost[CostKind])
898 return LT.first * *KindCost;
899
900 static const CostKindTblEntry AVX512CostTable[] = {
901 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
902 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
903 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
904
905 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence.
906 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
907 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
908
909 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
910 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
911 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
912 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
913 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
914 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
915 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
916 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
917 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
918
919 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
920 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
921 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
922 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
923 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
924 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
925 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
926 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
927 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
928
929 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
930 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
931
932 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
933 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
934
935 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
936 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
937 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
938 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
939
940 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
941 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
942 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
943 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
944
945 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
946 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
947 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
948 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
949
950 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
951 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
952 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
953 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
954 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
955
956 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
957
958 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
959 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
960 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
961 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
962 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
963 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
964 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
965 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
966 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
967
968 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
969 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
970 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
971 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
972
973 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
974 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
975 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
976 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
977 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
978 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
979 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
980 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
981 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
982
983 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
984 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
985 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
986 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
987 };
988
989 if (ST->hasAVX512())
990 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
991 if (auto KindCost = Entry->Cost[CostKind])
992 return LT.first * *KindCost;
993
994 static const CostKindTblEntry AVX2ShiftCostTable[] = {
995 // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare to
996 // customize them to detect the cases where the shift amount is a scalar one.
997 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
998 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
999 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
1000 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
1001 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
1002 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
1003 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
1004 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
1005 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
1006 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
1007 };
1008
1009 if (ST->hasAVX512()) {
1010 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
1011 // On AVX512, a packed v32i16 shift left by a constant build_vector
1012 // is lowered into a vector multiply (vpmullw).
1013 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
1014 Op1Info.getNoProps(), Op2Info.getNoProps());
1015 }
1016
1017 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
1018 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
1019 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
1020 Op2Info.isConstant())
1021 // On AVX2, a packed v16i16 shift left by a constant build_vector
1022 // is lowered into a vector multiply (vpmullw).
1023 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
1024 Op1Info.getNoProps(), Op2Info.getNoProps());
1025
1026 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
1027 if (auto KindCost = Entry->Cost[CostKind])
1028 return LT.first * *KindCost;
1029 }
1030
1031 static const CostKindTblEntry XOPShiftCostTable[] = {
1032 // 128bit shifts take 1cy, but right shifts require negation beforehand.
1033 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
1034 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
1035 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
1036 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
1037 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
1038 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
1039 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
1040 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
1041 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
1042 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
1043 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
1044 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
1045 // 256bit shifts require splitting if AVX2 didn't catch them above.
1046 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
1047 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
1048 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
1049 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
1050 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
1051 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1052 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1053 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1054 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1055 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1056 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1057 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1058 };
1059
1060 // Look for XOP lowering tricks.
1061 if (ST->hasXOP()) {
1062 // If the right shift is constant then we'll fold the negation so
1063 // it's as cheap as a left shift.
1064 int ShiftISD = ISD;
1065 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1066 ShiftISD = ISD::SHL;
1067 if (const auto *Entry =
1068 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1069 if (auto KindCost = Entry->Cost[CostKind])
1070 return LT.first * *KindCost;
1071 }
1072
1073 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1074 MVT VT = LT.second;
1075 // Vector shift left by non uniform constant can be lowered
1076 // into vector multiply.
1077 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1078 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1079 ISD = ISD::MUL;
1080 }
1081
1082 static const CostKindTblEntry GLMCostTable[] = {
1083 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1084 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1085 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1086 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1087 };
1088
1089 if (ST->useGLMDivSqrtCosts())
1090 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1091 if (auto KindCost = Entry->Cost[CostKind])
1092 return LT.first * *KindCost;
1093
1094 static const CostKindTblEntry SLMCostTable[] = {
1095 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1096 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1097 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1098 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1099 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1100 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1101 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1102 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1103 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1104 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1105 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1106 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1107 // v2i64/v4i64 mul is custom lowered as a series of long
1108 // multiplies(3), shifts(3) and adds(2).
1109 // slm muldq version throughput is 2 and addq throughput 4
1110 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1111 // 2X4 (addq throughput) = 17
1112 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1113 // slm addq\subq throughput is 4
1114 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1115 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1116 };
1117
1118 if (ST->useSLMArithCosts())
1119 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1120 if (auto KindCost = Entry->Cost[CostKind])
1121 return LT.first * *KindCost;
1122
1123 static const CostKindTblEntry AVX2CostTable[] = {
1124 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1125 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1126 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsllvd/pack sequence.
1127 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsllvd/pack sequence.
1128
1129 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1130 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1131 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1132 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1133
1134 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1135 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1136 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1137 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1138 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1139 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1140
1141 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1142 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1143 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1144 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1145 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1146 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1147 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1148 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1149
1150 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1151 { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
1152 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1153 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1154 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1155 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1156 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1157
1158 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1159
1160 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1161 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1162
1163 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1164 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1165 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1166 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1167 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1168 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1169
1170 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1171 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1172 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1173 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1174 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1175 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1176
1177 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1178 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1179 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1180 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1181 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1182 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1183
1184 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1185 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1186 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1187 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1188 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1189 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1190 };
1191
1192 // Look for AVX2 lowering tricks for custom cases.
1193 if (ST->hasAVX2())
1194 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1195 if (auto KindCost = Entry->Cost[CostKind])
1196 return LT.first * *KindCost;
1197
1198 static const CostKindTblEntry AVX1CostTable[] = {
1199 // We don't have to scalarize unsupported ops. We can issue two half-sized
1200 // operations and we only need to extract the upper YMM half.
1201 // Two ops + 1 extract + 1 insert = 4.
1202 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1203 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1204 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1205 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1206 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1207 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1208
1209 { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split
1210
1211 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1212 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1213 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1214 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1215
1216 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1217 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1218 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1219 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1220
1221 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1222 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1223 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1224 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1225
1226 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1227 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1228 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1229 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1230 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1231 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1232 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1233 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1234 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1235 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1236
1237 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1238 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1239 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1240 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1241 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1242 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1243 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1244 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1245
1246 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1247 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1248 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1249 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1250 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1251 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1252 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1253 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1254
1255 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1256 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1257 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1258 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1259 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1260 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1261 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1262 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1263
1264 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1265 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1266
1267 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1268 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1269 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1270 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1271 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1272 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1273
1274 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1275 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1276 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1277 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1278 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1279 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1280
1281 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1282 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1283 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1284 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1285 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1286 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1287
1288 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1289 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1290 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1291 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1292 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1293 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1294 };
1295
1296 if (ST->hasAVX())
1297 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1298 if (auto KindCost = Entry->Cost[CostKind])
1299 return LT.first * *KindCost;
1300
1301 static const CostKindTblEntry SSE42CostTable[] = {
1302 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1303 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1306
1307 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1308 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1311
1312 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1313 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1314 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1315 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1316
1317 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1318 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1319 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1320 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1321
1322 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1323 };
1324
1325 if (ST->hasSSE42())
1326 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1327 if (auto KindCost = Entry->Cost[CostKind])
1328 return LT.first * *KindCost;
1329
1330 static const CostKindTblEntry SSE41CostTable[] = {
1331 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1332 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1333 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1334
1335 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1336 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1337 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1338 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1339
1340 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1341 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1342 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1343 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1344
1345 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1346 };
1347
1348 if (ST->hasSSE41())
1349 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSSE3CostTable[] = {
1354 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1355 };
1356
1357 if (ST->hasSSSE3())
1358 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1359 if (auto KindCost = Entry->Cost[CostKind])
1360 return LT.first * *KindCost;
1361
1362 static const CostKindTblEntry SSE2CostTable[] = {
1363 // We don't correctly identify costs of casts because they are marked as
1364 // custom.
1365 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1366 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1367 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1368 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1369
1370 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1371 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1372 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1373 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1374
1375 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1376 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1377 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1378 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1379
1380 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1381 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1382 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1383 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1384
1385 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1386 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1387 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1388 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1389
1390 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1391 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1392 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1393 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1394
1395 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1396 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1397
1398 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1399 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1400 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1401 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1402
1403 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1404
1405 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1406 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414
1415 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1417 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1418
1419 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1420 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1421 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1422
1423 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1424 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1425 };
1426
1427 if (ST->hasSSE2())
1428 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1429 if (auto KindCost = Entry->Cost[CostKind])
1430 return LT.first * *KindCost;
1431
1432 static const CostKindTblEntry SSE1CostTable[] = {
1433 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1438
1439 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1440 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1441
1442 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1443 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1444
1445 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1446 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1447 };
1448
1449 if (ST->hasSSE1())
1450 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1451 if (auto KindCost = Entry->Cost[CostKind])
1452 return LT.first * *KindCost;
1453
1454 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1455 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1456 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1457 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1458 };
1459
1460 if (ST->is64Bit())
1461 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1462 if (auto KindCost = Entry->Cost[CostKind])
1463 return LT.first * *KindCost;
1464
1465 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1466 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1467 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1468 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1469
1470 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1471 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1472 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1473
1474 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1475 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1476 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1477
1478 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1479 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1480 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1481 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1482 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1483 };
1484
1485 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1486 if (auto KindCost = Entry->Cost[CostKind])
1487 return LT.first * *KindCost;
1488
1489 // It is not a good idea to vectorize division. We have to scalarize it and
1490 // in the process we will often end up having to spill regular
1491 // registers. The overhead of division is going to dominate most kernels
1492 // anyways so try hard to prevent vectorization of division - it is
1493 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1494 // to hide "20 cycles" for each lane.
1495 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1496 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1497 ISD == ISD::UREM)) {
1498 InstructionCost ScalarCost =
1499 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1500 Op1Info.getNoProps(), Op2Info.getNoProps());
1501 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1502 }
1503
1504 // Handle some basic single instruction code size cases.
1505 if (CostKind == TTI::TCK_CodeSize) {
1506 switch (ISD) {
1507 case ISD::FADD:
1508 case ISD::FSUB:
1509 case ISD::FMUL:
1510 case ISD::FDIV:
1511 case ISD::FNEG:
1512 case ISD::AND:
1513 case ISD::OR:
1514 case ISD::XOR:
1515 return LT.first;
1516 break;
1517 }
1518 }
1519
1520 // Fallback to the default implementation.
1521 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1522 Args, CxtI);
1523}
1524
1527 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1529 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1530 return TTI::TCC_Basic;
1532}
1533
1535 VectorType *DstTy, VectorType *SrcTy,
1536 ArrayRef<int> Mask,
1538 int Index, VectorType *SubTp,
1540 const Instruction *CxtI) const {
1541 assert((Mask.empty() || DstTy->isScalableTy() ||
1542 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1543 "Expected the Mask to match the return size if given");
1544 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1545 "Expected the same scalar types");
1546
1547 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1548 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1549 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1550
1551 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1552
1553 // If all args are constant then this will be constant folded away.
1554 if (!Args.empty() &&
1555 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1556 return TTI::TCC_Free;
1557
1558 // Recognize a basic concat_vector shuffle.
1559 if (Kind == TTI::SK_PermuteTwoSrc &&
1560 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1561 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1565 CostKind, Mask.size() / 2, SrcTy);
1566
1567 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1568 if (Kind == TTI::SK_Transpose)
1569 if (LT.second != MVT::v4f64 && LT.second != MVT::v4i64)
1570 Kind = TTI::SK_PermuteTwoSrc;
1571
1572 if (Kind == TTI::SK_Broadcast) {
1573 // For Broadcasts we are splatting the first element from the first input
1574 // register, so only need to reference that input and all the output
1575 // registers are the same.
1576 LT.first = 1;
1577
1578 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1579 // Likewise, a load with many uses can be folded into the free broadcast if
1580 // every use is one of a small set of operations that SLP can rewrite into
1581 // a single vector lane.
1582 using namespace PatternMatch;
1583 auto IsBroadcastLoadFoldUser = [&](const User *U) {
1584 if (isa<InsertElementInst>(U) && U->getOperand(1) == Args[0])
1585 return true;
1586 if (U->getType()->isVectorTy())
1587 return false;
1588 // Terminators (return/branch/switch/indirectbr/resume/invoke EH)
1589 // and phis carry the value across control flow.
1590 if (const auto *I = dyn_cast<Instruction>(U))
1591 if (I->isTerminator() ||
1593 return false;
1594 // Only pure calls can be folded.
1595 if (const auto *CB = dyn_cast<CallBase>(U))
1596 return CB->doesNotAccessMemory() && !CB->mayHaveSideEffects();
1597 return true;
1598 };
1599 auto IsFoldableSLPBroadcastLoad = [&]() {
1600 if (!match(Args[0], m_Load(m_Value())))
1601 return false;
1602 auto *FVT = dyn_cast<FixedVectorType>(DstTy);
1603 if (!FVT)
1604 return false;
1605 // getNumUses() counts each Use, matching the per-lane broadcast
1606 // accounting (a use like `op %x, %x` consumes two broadcast lanes).
1607 if (Args[0]->getNumUses() != FVT->getNumElements())
1608 return false;
1609 return all_of(Args[0]->users(), IsBroadcastLoadFoldUser);
1610 };
1611 if (!Args.empty() &&
1612 (match(Args[0], m_OneUse(m_Load(m_Value()))) ||
1613 IsFoldableSLPBroadcastLoad()) &&
1614 (ST->hasAVX2() ||
1615 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1616 return TTI::TCC_Free;
1617 }
1618
1619 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1620 // permutation.
1621 // Attempt to detect a shuffle mask with a single defined element.
1622 bool IsInLaneShuffle = false;
1623 bool IsSingleElementMask = false;
1624 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1625 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1626 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1627 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1628 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1629 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1630 if ((Mask.size() % NumLanes) == 0) {
1631 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1632 return P.value() == PoisonMaskElem ||
1633 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1634 (P.index() / NumEltsPerLane);
1635 });
1636 IsSingleElementMask =
1637 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1638 return M == PoisonMaskElem;
1639 }));
1640 }
1641 }
1642
1643 // Treat <X x bfloat> shuffles as <X x half>.
1644 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1645 LT.second = LT.second.changeVectorElementType(MVT::f16);
1646
1647 // Subvector extractions are free if they start at the beginning of a
1648 // vector and cheap if the subvectors are aligned.
1649 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1650 int NumElts = LT.second.getVectorNumElements();
1651 if ((Index % NumElts) == 0)
1652 return TTI::TCC_Free;
1653 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1654 if (SubLT.second.isVector()) {
1655 int NumSubElts = SubLT.second.getVectorNumElements();
1656 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1657 return SubLT.first;
1658 // Handle some cases for widening legalization. For now we only handle
1659 // cases where the original subvector was naturally aligned and evenly
1660 // fit in its legalized subvector type.
1661 // FIXME: Remove some of the alignment restrictions.
1662 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1663 // vectors.
1664 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1665 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1666 (NumSubElts % OrigSubElts) == 0 &&
1667 LT.second.getVectorElementType() ==
1668 SubLT.second.getVectorElementType() &&
1669 LT.second.getVectorElementType().getSizeInBits() ==
1670 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1671 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1672 "Unexpected number of elements!");
1673 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1674 LT.second.getVectorNumElements());
1675 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1676 SubLT.second.getVectorNumElements());
1677 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1678 InstructionCost ExtractCost =
1680 ExtractIndex, SubTy);
1681
1682 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1683 // if we have SSSE3 we can use pshufb.
1684 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1685 return ExtractCost + 1; // pshufd or pshufb
1686
1687 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1688 "Unexpected vector size");
1689
1690 return ExtractCost + 2; // worst case pshufhw + pshufd
1691 }
1692 }
1693 // If the extract subvector is not optimal, treat it as single op shuffle.
1695 }
1696
1697 // Subvector insertions are cheap if the subvectors are aligned.
1698 // Note that in general, the insertion starting at the beginning of a vector
1699 // isn't free, because we need to preserve the rest of the wide vector,
1700 // but if the destination vector legalizes to the same width as the subvector
1701 // then the insertion will simplify to a (free) register copy.
1702 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1703 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1704 int NumElts = DstLT.second.getVectorNumElements();
1705 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1706 if (SubLT.second.isVector()) {
1707 int NumSubElts = SubLT.second.getVectorNumElements();
1708 bool MatchingTypes =
1709 NumElts == NumSubElts &&
1710 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1711 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1712 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1713 }
1714
1715 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1716 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1717 // v1f32 (legalised to f32) into a v4f32.
1718 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1719 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1720 return 1;
1721
1722 // If the insertion is the lowest subvector then it will be blended
1723 // otherwise treat it like a 2-op shuffle.
1724 Kind =
1725 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1726 }
1727
1728 // Handle some common (illegal) sub-vector types as they are often very cheap
1729 // to shuffle even on targets without PSHUFB.
1730 EVT VT = TLI->getValueType(DL, SrcTy);
1731 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1732 !ST->hasSSSE3()) {
1733 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1734 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1735 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1736 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1737 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1738 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1739
1740 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1741 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1742 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1743 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1744
1745 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1746 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1747 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1748 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1749
1750 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1751 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1752 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1753 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1754 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1755
1756 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1757 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1758 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1759 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1760 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1761 };
1762
1763 if (ST->hasSSE2())
1764 if (const auto *Entry =
1765 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1766 if (auto KindCost = Entry->Cost[CostKind])
1767 return LT.first * *KindCost;
1768 }
1769
1770 // We are going to permute multiple sources and the result will be in multiple
1771 // destinations. Providing an accurate cost only for splits where the element
1772 // type remains the same.
1773 if (LT.first != 1) {
1774 MVT LegalVT = LT.second;
1775 if (LegalVT.isVector() &&
1776 LegalVT.getVectorElementType().getSizeInBits() ==
1777 SrcTy->getElementType()->getPrimitiveSizeInBits() &&
1778 LegalVT.getVectorNumElements() <
1779 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1780 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1781 unsigned LegalVTSize = LegalVT.getStoreSize();
1782 // Number of source vectors after legalization:
1783 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1784 // Number of destination vectors after legalization:
1785 InstructionCost NumOfDests = LT.first;
1786
1787 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1788 LegalVT.getVectorNumElements());
1789
1790 if (!Mask.empty() && NumOfDests.isValid()) {
1791 // Try to perform better estimation of the permutation.
1792 // 1. Split the source/destination vectors into real registers.
1793 // 2. Do the mask analysis to identify which real registers are
1794 // permuted. If more than one source register is used for the
1795 // destination register building, the cost for this destination register
1796 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1797 // source register is used, build mask and calculate the cost as a cost
1798 // of PermuteSingleSrc.
1799 // Also, for the single register permute we try to identify if the
1800 // destination register is just a copy of the source register or the
1801 // copy of the previous destination register (the cost is
1802 // TTI::TCC_Basic). If the source register is just reused, the cost for
1803 // this operation is TTI::TCC_Free.
1804 NumOfDests =
1806 FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1807 .first;
1808 unsigned E = NumOfDests.getValue();
1809 unsigned NormalizedVF =
1810 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1811 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1812 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1813 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1814 copy(Mask, NormalizedMask.begin());
1815 unsigned PrevSrcReg = 0;
1816 ArrayRef<int> PrevRegMask;
1819 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1820 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1821 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1822 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1823 // Check if the previous register can be just copied to the next
1824 // one.
1825 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1826 PrevRegMask != RegMask)
1827 Cost +=
1829 SingleOpTy, RegMask, CostKind, 0, nullptr);
1830 else
1831 // Just a copy of previous destination register.
1833 return;
1834 }
1835 if (SrcReg != DestReg &&
1836 any_of(RegMask, not_equal_to(PoisonMaskElem))) {
1837 // Just a copy of the source register.
1839 }
1840 PrevSrcReg = SrcReg;
1841 PrevRegMask = RegMask;
1842 },
1843 [this, SingleOpTy, CostKind,
1844 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1845 unsigned /*Unused*/, bool /*Unused*/) {
1847 SingleOpTy, RegMask, CostKind, 0, nullptr);
1848 });
1849 return Cost;
1850 }
1851
1852 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1853 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1854 SingleOpTy, {}, CostKind, 0,
1855 nullptr);
1856 }
1857
1858 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1859 SubTp);
1860 }
1861
1862 // If we're just moving a single element around (probably as an alternative to
1863 // extracting it), we can assume this is cheap.
1864 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1865 return TTI::TCC_Basic;
1866
1867 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1868 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1869 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1870 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1871 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1872 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1873 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1874 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1875 };
1876
1877 if (ST->hasVBMI())
1878 if (const auto *Entry =
1879 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1880 if (auto KindCost = Entry->Cost[CostKind])
1881 return LT.first * *KindCost;
1882
1883 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1884 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1885 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1886 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1887
1888 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1889 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1890 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1891 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1892 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1893
1894 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1895 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1896 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1897 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1898 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1899
1900 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1901 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1902 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1903 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1904 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1905
1906 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1907 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1908
1909 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1910 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1911 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1912 };
1913
1914 if (ST->hasBWI())
1915 if (const auto *Entry =
1916 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1917 if (auto KindCost = Entry->Cost[CostKind])
1918 return LT.first * *KindCost;
1919
1920 static const CostKindTblEntry AVX512InLaneShuffleTbl[] = {
1921 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } },
1922 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } },
1923 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } },
1924 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } },
1925 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } },
1926 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } },
1927 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } },
1928 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } },
1929 };
1930
1931 if (IsInLaneShuffle && ST->hasAVX512())
1932 if (const auto *Entry =
1933 CostTableLookup(AVX512InLaneShuffleTbl, Kind, LT.second))
1934 if (auto KindCost = Entry->Cost[CostKind])
1935 return LT.first * *KindCost;
1936
1937 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1938 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1939 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1940 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1941 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1942 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1943 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1944 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1945 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1946 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1947 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1948 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1949 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1950 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1951 {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
1952
1953 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1954 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1955 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1956 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1957 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1958 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1959 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1960
1961 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1962 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1963 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1964 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1965 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1966 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1967 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1968 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1969 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1970 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1971 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1972
1973 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1974 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1975 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1976 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1977 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1978 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1979 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1980 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1981 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1982 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1983 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1984 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1985 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1986
1987 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 2, 3, 1, 1 } }, // vpermt2pd
1988 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 2, 3, 1, 1 } }, // vpermt2ps
1989 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 2, 3, 1, 1 } }, // vpermt2q
1990 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 2, 3, 1, 1 } }, // vpermt2d
1991 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 3, 1, 1 } }, // vpermt2pd
1992 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 3, 1, 1 } }, // vpermt2ps
1993 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 3, 1, 1 } }, // vpermt2q
1994 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 3, 1, 1 } }, // vpermt2d
1995 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } },
1996 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } },
1997 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } },
1998 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } },
1999
2000 // FIXME: This just applies the type legalization cost rules above
2001 // assuming these completely split.
2002 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
2003 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
2004 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
2005 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
2006 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
2007 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
2008
2009 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
2010 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
2011 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
2012 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
2013 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
2014 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
2015 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
2016 };
2017
2018 if (ST->hasAVX512())
2019 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
2020 if (auto KindCost = Entry->Cost[CostKind])
2021 return LT.first * *KindCost;
2022
2023 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
2024 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
2025 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
2026 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
2027
2028 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2029 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2030
2031 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2032 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2033 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
2034 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
2035 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2036 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2037 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2038 };
2039
2040 if (IsInLaneShuffle && ST->hasAVX2())
2041 if (const auto *Entry =
2042 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
2043 if (auto KindCost = Entry->Cost[CostKind])
2044 return LT.first * *KindCost;
2045
2046 static const CostKindTblEntry AVX2ShuffleTbl[] = {
2047 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
2048 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
2049 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
2050 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
2051 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
2052 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
2053 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
2054 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
2055 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2056 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2057
2058 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2059 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2060 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2061 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2062 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2063 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2064 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2065
2066 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2067 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2068 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2069
2070 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2071 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2072 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2073 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2074 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2075
2076 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2077 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2078 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2079 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2080 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2081 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2082 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2083
2084 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2085 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2086 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2087 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2088 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2089 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2090 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2091 };
2092
2093 if (ST->hasAVX2())
2094 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2095 if (auto KindCost = Entry->Cost[CostKind])
2096 return LT.first * *KindCost;
2097
2098 static const CostKindTblEntry XOPShuffleTbl[] = {
2099 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2100 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2101 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2102 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2103 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2104 // + vinsertf128
2105 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2106 // + vinsertf128
2107
2108 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2109 // + vinsertf128
2110
2111 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2112 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2113 // + vinsertf128
2114 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2115 };
2116
2117 if (ST->hasXOP())
2118 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2119 if (auto KindCost = Entry->Cost[CostKind])
2120 return LT.first * *KindCost;
2121
2122 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2123 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2124 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2125 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2126 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2127
2128 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2129 // + vpor + vinsertf128
2130 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2131 // + vpor + vinsertf128
2132 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2133 // + vpor + vinsertf128
2134
2135 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2136 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2137
2138 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2139 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2140 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2141 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2142 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2143 // + 2*vpor + vinsertf128
2144 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2145 // + 2*vpor + vinsertf128
2146 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2147 // + 2*vpor + vinsertf128
2148 };
2149
2150 if (IsInLaneShuffle && ST->hasAVX())
2151 if (const auto *Entry =
2152 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2153 if (auto KindCost = Entry->Cost[CostKind])
2154 return LT.first * *KindCost;
2155
2156 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2157 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2158 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2159 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2160 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2161 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2162 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2163 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2164
2165 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2166 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2167 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2168 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2169 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2170 // + vinsertf128
2171 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2172 // + vinsertf128
2173 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2174 // + vinsertf128
2175
2176 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2177 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2178 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2179 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2180 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2181 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2182 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2183
2184 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2185 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2186 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2187 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2188 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2189 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2190 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2191
2192 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2193 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2194 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2195 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2196 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2197 // + 2*por + vinsertf128
2198 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2199 // + 2*por + vinsertf128
2200 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2201 // + 2*por + vinsertf128
2202
2203 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2204 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2205 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2206 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2207 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2208 // + 4*por + vinsertf128
2209 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2210 // + 4*por + vinsertf128
2211 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2212 // + 4*por + vinsertf128
2213 };
2214
2215 if (ST->hasAVX())
2216 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2217 if (auto KindCost = Entry->Cost[CostKind])
2218 return LT.first * *KindCost;
2219
2220 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2221 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2222 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2223 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2224 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2225 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2226 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2227 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2228 };
2229
2230 if (ST->hasSSE41())
2231 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2232 if (auto KindCost = Entry->Cost[CostKind])
2233 return LT.first * *KindCost;
2234
2235 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2236 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2237 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2238 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2239
2240 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2241 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2242 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2243
2244 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2245 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2246 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2247
2248 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2249 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2250 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2251 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2252 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2253
2254 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2255 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2256 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2257
2258 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2259 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2260 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2261 };
2262
2263 if (ST->hasSSSE3())
2264 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2265 if (auto KindCost = Entry->Cost[CostKind])
2266 return LT.first * *KindCost;
2267
2268 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2269 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2270 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2271 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2272 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2273 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2274 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2275
2276 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2277 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2278 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2279 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2280 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2281 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2282 // + 2*pshufd + 2*unpck + packus
2283
2284 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2285 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2286 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2287 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2288 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2289 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2290
2291 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2292 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2293 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2294 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2295 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2296 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + pslldq + por
2297
2298 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2299 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2300 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2301 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2302 // + pshufd/unpck
2303 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2304 // + pshufd/unpck
2305 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2306 // + 2*pshufd + 2*unpck + 2*packus
2307
2308 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2309 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2310 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2311 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2312 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2313 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2314 };
2315
2316 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2317 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2318 };
2319
2320 if (ST->hasSSE2()) {
2321 bool IsLoad =
2322 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2323 if (ST->hasSSE3() && IsLoad)
2324 if (const auto *Entry =
2325 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2326 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2327 LT.second.getVectorElementCount()) &&
2328 "Table entry missing from isLegalBroadcastLoad()");
2329 return LT.first * Entry->Cost;
2330 }
2331
2332 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2333 if (auto KindCost = Entry->Cost[CostKind])
2334 return LT.first * *KindCost;
2335 }
2336
2337 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2338 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2339 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2340 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2341 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2342 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2343 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2344 };
2345
2346 if (ST->hasSSE1()) {
2347 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2348 // SHUFPS: both pairs must come from the same source register.
2349 auto MatchSHUFPS = [](int X, int Y) {
2350 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2351 };
2352 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2353 return 1;
2354 }
2355 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2356 if (auto KindCost = Entry->Cost[CostKind])
2357 return LT.first * *KindCost;
2358 }
2359
2360 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2361 SubTp);
2362}
2363
2365 Type *Src,
2368 const Instruction *I) const {
2369 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2370 assert(ISD && "Invalid opcode");
2371
2372 // The cost tables include both specific, custom (non-legal) src/dst type
2373 // conversions and generic, legalized types. We test for customs first, before
2374 // falling back to legalization.
2375 // FIXME: Need a better design of the cost table to handle non-simple types of
2376 // potential massive combinations (elem_num x src_type x dst_type).
2377 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2378 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2379 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2380
2381 // Mask sign extend has an instruction.
2382 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2383 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2384 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2385 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2386 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2387 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2388 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2389 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2390 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2391 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2392 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2393 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2394 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2395 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2396 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2397 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2398 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2399
2400 // Mask zero extend is a sext + shift.
2401 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2402 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2403 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2404 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2405 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2406 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2407 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2408 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2409 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2410 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2411 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2412 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2413 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2414 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2415 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2416 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2417 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2418
2419 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2420 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2421 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2422 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2423 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2424 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2425 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2426 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2427 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2428 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2429 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2430 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2431 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2432 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2433 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2434 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2435 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2436
2437 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2438 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2439 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2440 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2441 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2442 };
2443
2444 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2445 // Mask sign extend has an instruction.
2446 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2447 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2448 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2449 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2450 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2451 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2452 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2453 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2454
2455 // Mask zero extend is a sext + shift.
2456 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2457 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2458 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2459 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2460 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2461 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2462 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2463 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2464
2465 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2466 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2467 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2468 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2469 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2470 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2471 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2472 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2473
2474 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2475 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2476
2477 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2478 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2479
2480 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2481 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2482
2483 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2484 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2485 };
2486
2487 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2488 // 256-bit wide vectors.
2489
2490 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2491 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2492 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2493 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2494 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2495 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2496 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2497 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2498
2499 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2500 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2501 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2502 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2503 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2504 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2505 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2506 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2507 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2508 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2509 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2510 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2511 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2512 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2513 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2514 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2515 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2516 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2517 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2518 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2519 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2520 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2521 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2522 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2523 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2524 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2525 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2526 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2527 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2528 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2529 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2530 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2531 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2532 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2533
2534 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2535 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2536 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2537
2538 // Sign extend is zmm vpternlogd+vptruncdb.
2539 // Zero extend is zmm broadcast load+vptruncdw.
2540 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2541 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2542 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2543 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2544 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2545 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2546 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2547 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2548
2549 // Sign extend is zmm vpternlogd+vptruncdw.
2550 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2551 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2552 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2553 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2554 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2555 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2556 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2557 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2558 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2559
2560 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2561 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2562 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2563 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2564 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2565 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2566 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2567 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2568 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2569 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2570
2571 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2572 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2573 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2574 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2575
2576 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2577 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2578 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2579 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2580 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2581 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2582 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2583 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2585 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2586
2587 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2588 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2589
2590 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2591 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2592 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2593 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2594 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2595 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2596 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2597 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2598
2599 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2600 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2601 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2602 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2603 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2604 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2605 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2606 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2607 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2608 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2609
2610 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2611 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2612 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2613 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2614 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2615 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2616 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2617 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2618 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2619 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2620 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2621
2622 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2623 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2624 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2625 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2626 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2627 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2628 };
2629
// Conversion cost table for vector types of at most 256 bits involving
// i8/i16 elements and i1 masks. Each entry is
// { ISD opcode, destination MVT, source MVT, { four costs } }; the cost
// array is indexed by the requested cost kind. Only the first cost varies in
// this table, the other three are uniformly 1.
// NOTE(review): presumably consulted only when the subtarget has BWI (and
// VLX) - the lookup site is not visible here; confirm.
2630 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2631 // Mask sign extend has an instruction.
2632 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2633 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2634 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2635 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2636 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2637 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2638 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2639 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2640 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2641 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2642 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2643 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2644 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2645 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2646 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2647 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2648 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2649
2650 // Mask zero extend is a sext + shift.
2651 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2652 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2653 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2655 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2657 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2659 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2660 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2661 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2662 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2663 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2664 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2665 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2666 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2667 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2668
2669 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2670 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2671 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2672 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2673 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2674 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2675 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2676 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2677 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2678 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2679 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2680 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2681 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2682 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2683 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2684 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2685 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2686
2687 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2688 };
2689
// Conversion cost table for vector types of at most 256 bits covering
// i32/i64 <-> i1 mask conversions and 64-bit integer <-> floating point
// conversions. Each entry is
// { ISD opcode, destination MVT, source MVT, { four costs } }; the cost
// array is indexed by the requested cost kind. Every entry here costs 1 or 2
// in the first slot and 1 in the remaining three.
// NOTE(review): presumably consulted only when the subtarget has DQI (and
// VLX) - the lookup site is not visible here; confirm.
2690 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2691 // Mask sign extend has an instruction.
2692 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2693 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2694 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2695 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2696 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2697 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2698 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2699 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2700
2701 // Mask zero extend is a sext + shift.
2702 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2703 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2704 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2705 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2706 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2707 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2708 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2709 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2710
2711 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2712 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2713 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2714 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2715 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2716 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2717 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2718 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2719
2720 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2721 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2722 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2723 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2724
2725 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2726 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2727 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2728 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2729
2730 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2731 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2732 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2733 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2734
2735 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2736 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2737 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2738 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2739 };
2740
// Conversion cost table for scalar types and vector types of at most 256
// bits (plus a few wider FP sources). Each entry is
// { ISD opcode, destination MVT, source MVT, { four costs } }; the cost
// array is indexed by the requested cost kind. Only the first cost varies in
// this table, the other three are uniformly 1. Trailing comments name the
// instruction sequence the first cost is modelled on.
// NOTE(review): presumably the fallback for AVX512 subtargets without the
// BWI/DQI features - the lookup site is not visible here; confirm.
2741 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2742 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2743 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2744 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2745 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2746 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2747 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2748 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2749 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2750 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2751 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2752 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2753 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2754 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2755 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2756 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2757 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2758 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2759 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovdb
2760
2761 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2762 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2763 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2764 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2765 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2766 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2767 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2768 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2769 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2770 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2771
2772 // sign extend is vpcmpeq+maskedmove+vpmovdw
2773 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2774 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2775 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2776 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2777 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2778 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2779 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2780 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2781 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2782
2783 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2784 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2785 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2786 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2787 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2788 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2789 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2790 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2791
2792 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2793 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2794 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2795 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2796
2797 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2798 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2799 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2800 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2801 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2802 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2803 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2804 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2805 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2806 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2807 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2808 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2809
2810 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2811 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2812 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2813 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2814
2815 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2816 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2817 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2818 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2819 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2820 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2821 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2822 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2823 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2824 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2825 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2826 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2827 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2828
2829 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2830 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2831 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2832
2833 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2834 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2835 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2836 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2837 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2838 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2839 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2840 };
2841
// Conversion cost table covering integer extends/truncates and
// integer <-> floating point conversions, mostly on 256-bit vector types.
// Each entry is { ISD opcode, destination MVT, source MVT, { four costs } };
// the cost array is indexed by the requested cost kind. Only the first cost
// varies in this table, the other three are uniformly 1.
// NOTE(review): presumably consulted when the subtarget has AVX2 - the
// lookup site is not visible here; confirm.
2842 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2843 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2844 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2845 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2846 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2847 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2848 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2849
2850 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2851 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2852 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2853 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2854 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2855 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2856 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2857 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2858 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2859 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2860 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2861 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2862 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2863 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2864
2865 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2866
2867 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2868 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2869 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2870 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2871 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2872 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2873 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2874 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2875 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2876 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2877 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2878 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2879
2880 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2881 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2882
2883 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2884 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2885 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2886 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2887
2888 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2889 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2890 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2891 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2892 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2893 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2894 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2895 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2896
2897 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2898 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2899 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2902 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2903 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2904
2905 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2906 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2907 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2908 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2909 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2913 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2914 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2915 };
2916
// Conversion cost table covering integer extends/truncates, integer <->
// floating point conversions and v4f32 <-> v4f64 extend/round, mostly on
// 256-bit vector types. Each entry is
// { ISD opcode, destination MVT, source MVT, { four costs } }; the cost
// array is indexed by the requested cost kind. Only the first cost varies in
// this table, the other three are uniformly 1.
// NOTE(review): presumably consulted when the subtarget has AVX - the
// lookup site is not visible here; confirm.
2917 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2918 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2919 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2920 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2921 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2922 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2923 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2924
2925 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2926 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2927 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2928 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2929 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2930 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2931 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2932 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2933 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2934 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2935 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2936 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2937
2938 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2939 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2940 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2941 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2942 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2943
2944 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2945 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2946 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2947 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2948 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2949 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2950 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2951 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2952
2953 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2954 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2955 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2956 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2957 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2958 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2959 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2960 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2961 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2962 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2963 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2964 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2965
2966 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2967 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2968 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2969 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2970 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2971 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2972 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2973 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2974 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2975 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2976 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2977 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2978 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2979 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2980 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2981 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2982 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2983
2984 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2985 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2986 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2987 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2988 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2989 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2990 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2991 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2992 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2993 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2994 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2995
2996 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2997 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2998 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2999 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
3000 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
3001 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
3002 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
3003 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
3004 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
3005 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3006 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
3007 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
3008 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
3009
3010 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
3011 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
3012 };
3013
// Conversion cost table covering scalar and 128-bit vector integer
// extends/truncates and integer <-> floating point conversions. Each entry
// is { ISD opcode, destination MVT, source MVT, { four costs } }; the cost
// array is indexed by the requested cost kind. Only the first cost varies in
// this table, the other three are uniformly 1.
// NOTE(review): presumably consulted when the subtarget has SSE4.1 - the
// lookup site is not visible here; confirm.
3014 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
3015 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
3016 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
3017 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
3018 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
3019 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3020 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3021 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
3022 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
3023 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3024 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3025 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3026 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3027
3028 // These truncates end up widening elements.
3029 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
3030 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
3031 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
3032
3033 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
3034 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
3035 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
3036
3037 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
3038 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
3039 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
3040 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
3041 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
3042 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
3043 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
3044 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3045 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
3046 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
3047 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
3048
3049 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
3050 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
3051 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
3052 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
3053 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
3054 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
3055 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
3056 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3057 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
3058 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3059 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
3060 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
3061 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
3062 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
3063
3064 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3065 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
3066 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3067 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
3068 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3069 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3070 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3071 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3072 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
3073 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
3074
3075 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3076 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3077 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3078 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3079 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3080 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3081 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3082 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3083 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3084 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3085 };
3086
// Conversion cost table covering scalar and 128-bit vector integer <->
// floating point conversions, integer extends and truncates. Each entry is
// { ISD opcode, destination MVT, source MVT, { four costs } }; the cost
// array is indexed by the requested cost kind. Only the first cost varies in
// this table, the other three are uniformly 1. Trailing comments name the
// instruction sequence the first cost is modelled on.
// NOTE(review): presumably consulted when the subtarget has SSE2 - the
// lookup site is not visible here; confirm.
3087 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
3088 // These are somewhat magic numbers justified by comparing the
3089 // output of llvm-mca for our various supported scheduler models
3090 // and basing it off the worst case scenario.
3091 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3092 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3093 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
3094 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
3095 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
3096 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3097 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
3098 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3099 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3100 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3101 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3102 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3103
3104 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3105 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3106 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3107 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3108 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3109 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3110 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3111 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3112 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3113 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3114 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3115 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3116 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3117
3118 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3119 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3120 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3121 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3122 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3123 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3124 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3125 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3126 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3127 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3128
3129 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3130 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3131 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3132 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3133 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3134 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3135 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3136 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3137 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3138 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3139
3140 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3141 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3142 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3143 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3144 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3145 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3146 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3147 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3148 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3149 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3150 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3151 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3152
3153 // These truncates are really widening elements.
3154 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3155 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3156 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3157 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3158 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3159 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3160
3161 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3162 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3163 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3164 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3165 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3166 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3167 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3168 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3169 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3170 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3171 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3172 };
3173
// Conversion cost table for f16 <-> f32/f64 rounds and extends (scalar,
// 4-element and 8-element forms). Each entry is
// { ISD opcode, destination MVT, source MVT, { four costs } }; the cost
// array is indexed by the requested cost kind. Extends from f16 to f64 cost
// 2 in the first slot because they go through f32, as the trailing comments
// note.
// NOTE(review): presumably consulted when the subtarget has F16C - the
// lookup site is not visible here; confirm.
3174 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3175 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3176 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3177 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3178 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3179 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3180 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3181 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3182 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3183 };
3184
3185 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3186 EVT SrcTy = TLI->getValueType(DL, Src);
3187 EVT DstTy = TLI->getValueType(DL, Dst);
3188
3189 // If we're sign-extending a vector comparison result back to the comparison
3190 // width, this will be free without AVX512 (or for 8/16-bit types without
3191 // BWI).
3192 if (!ST->hasAVX512() || (!ST->hasBWI() && DstTy.getScalarSizeInBits() < 32)) {
3193 if (I && Opcode == Instruction::CastOps::SExt &&
3194 SrcTy.isFixedLengthVector() && SrcTy.getScalarType() == MVT::i1) {
3195 if (auto *CmpI = dyn_cast<CmpInst>(I->getOperand(0))) {
3196 Type *CmpTy = CmpI->getOperand(0)->getType();
3197 if (CmpTy->getScalarSizeInBits() == DstTy.getScalarSizeInBits())
3198 return TTI::TCC_Free;
3199 }
3200 }
3201 }
3202
3203 // The function getSimpleVT only handles simple value types.
3204 if (SrcTy.isSimple() && DstTy.isSimple()) {
3205 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3206 MVT SimpleDstTy = DstTy.getSimpleVT();
3207
3208 if (ST->useAVX512Regs()) {
3209 if (ST->hasBWI())
3210 if (const auto *Entry = ConvertCostTableLookup(
3211 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3212 if (auto KindCost = Entry->Cost[CostKind])
3213 return *KindCost;
3214
3215 if (ST->hasDQI())
3216 if (const auto *Entry = ConvertCostTableLookup(
3217 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3218 if (auto KindCost = Entry->Cost[CostKind])
3219 return *KindCost;
3220
3221 if (ST->hasAVX512())
3222 if (const auto *Entry = ConvertCostTableLookup(
3223 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3224 if (auto KindCost = Entry->Cost[CostKind])
3225 return *KindCost;
3226 }
3227
3228 if (ST->hasBWI())
3229 if (const auto *Entry = ConvertCostTableLookup(
3230 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3231 if (auto KindCost = Entry->Cost[CostKind])
3232 return *KindCost;
3233
3234 if (ST->hasDQI())
3235 if (const auto *Entry = ConvertCostTableLookup(
3236 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3237 if (auto KindCost = Entry->Cost[CostKind])
3238 return *KindCost;
3239
3240 if (ST->hasAVX512())
3241 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3242 SimpleDstTy, SimpleSrcTy))
3243 if (auto KindCost = Entry->Cost[CostKind])
3244 return *KindCost;
3245
3246 if (ST->hasAVX2()) {
3247 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3248 SimpleDstTy, SimpleSrcTy))
3249 if (auto KindCost = Entry->Cost[CostKind])
3250 return *KindCost;
3251 }
3252
3253 if (ST->hasAVX()) {
3254 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3255 SimpleDstTy, SimpleSrcTy))
3256 if (auto KindCost = Entry->Cost[CostKind])
3257 return *KindCost;
3258 }
3259
3260 if (ST->hasF16C()) {
3261 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3262 SimpleDstTy, SimpleSrcTy))
3263 if (auto KindCost = Entry->Cost[CostKind])
3264 return *KindCost;
3265 }
3266
3267 if (ST->hasSSE41()) {
3268 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3269 SimpleDstTy, SimpleSrcTy))
3270 if (auto KindCost = Entry->Cost[CostKind])
3271 return *KindCost;
3272 }
3273
3274 if (ST->hasSSE2()) {
3275 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3276 SimpleDstTy, SimpleSrcTy))
3277 if (auto KindCost = Entry->Cost[CostKind])
3278 return *KindCost;
3279 }
3280
3281 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3282 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3283 // fp16 conversions not covered by any table entries require a libcall.
3284 // Return a large (arbitrary) number to model this.
3285 return InstructionCost(64);
3286 }
3287 }
3288
3289 // Fall back to legalized types.
3290 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3291 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3292
3293 // If we're truncating to the same legalized type - just assume it's free.
3294 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3295 return TTI::TCC_Free;
3296
3297 if (ST->useAVX512Regs()) {
3298 if (ST->hasBWI())
3299 if (const auto *Entry = ConvertCostTableLookup(
3300 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3301 if (auto KindCost = Entry->Cost[CostKind])
3302 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3303
3304 if (ST->hasDQI())
3305 if (const auto *Entry = ConvertCostTableLookup(
3306 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3307 if (auto KindCost = Entry->Cost[CostKind])
3308 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3309
3310 if (ST->hasAVX512())
3311 if (const auto *Entry = ConvertCostTableLookup(
3312 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3313 if (auto KindCost = Entry->Cost[CostKind])
3314 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3315 }
3316
3317 if (ST->hasBWI())
3318 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3319 LTDest.second, LTSrc.second))
3320 if (auto KindCost = Entry->Cost[CostKind])
3321 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3322
3323 if (ST->hasDQI())
3324 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3325 LTDest.second, LTSrc.second))
3326 if (auto KindCost = Entry->Cost[CostKind])
3327 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3328
3329 if (ST->hasAVX512())
3330 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3331 LTDest.second, LTSrc.second))
3332 if (auto KindCost = Entry->Cost[CostKind])
3333 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3334
3335 if (ST->hasAVX2())
3336 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3337 LTDest.second, LTSrc.second))
3338 if (auto KindCost = Entry->Cost[CostKind])
3339 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3340
3341 if (ST->hasAVX())
3342 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3343 LTDest.second, LTSrc.second))
3344 if (auto KindCost = Entry->Cost[CostKind])
3345 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3346
3347 if (ST->hasF16C()) {
3348 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3349 LTDest.second, LTSrc.second))
3350 if (auto KindCost = Entry->Cost[CostKind])
3351 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3352 }
3353
3354 if (ST->hasSSE41())
3355 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3356 LTDest.second, LTSrc.second))
3357 if (auto KindCost = Entry->Cost[CostKind])
3358 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3359
3360 if (ST->hasSSE2())
3361 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3362 LTDest.second, LTSrc.second))
3363 if (auto KindCost = Entry->Cost[CostKind])
3364 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3365
3366 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3367 // sitofp.
3368 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3369 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3370 Type *ExtSrc = Src->getWithNewBitWidth(32);
3371 unsigned ExtOpc =
3372 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3373
3374 // For scalar loads the extend would be free.
3375 InstructionCost ExtCost = 0;
3376 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3377 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3378
3379 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3381 }
3382
3383 // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3384 // i32.
3385 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3386 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3387 Type *TruncDst = Dst->getWithNewBitWidth(32);
3388 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3389 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3391 }
3392
3393 // TODO: Allow non-throughput costs that aren't binary.
3394 auto AdjustCost = [&CostKind](InstructionCost Cost,
3397 return Cost == 0 ? 0 : N;
3398 return Cost * N;
3399 };
3400 return AdjustCost(
3401 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3402}
3403
3405 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3407 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3408 // Early out if this type isn't scalar/vector integer/float.
3409 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3410 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3411 Op1Info, Op2Info, I);
3412
3413 // Legalize the type.
3414 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3415
3416 MVT MTy = LT.second;
3417
3418 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3419 assert(ISD && "Invalid opcode");
3420
3421 InstructionCost ExtraCost = 0;
3422 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3423 // Some vector comparison predicates cost extra instructions.
3424 // TODO: Adjust ExtraCost based on CostKind?
3425 // TODO: Should we invert this and assume worst case cmp costs
3426 // and reduce for particular predicates?
3427 if (MTy.isVector() &&
3428 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3429 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3430 ST->hasBWI())) {
3431 // Fallback to I if a specific predicate wasn't specified.
3432 CmpInst::Predicate Pred = VecPred;
3433 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3435 Pred = cast<CmpInst>(I)->getPredicate();
3436
3437 bool CmpWithConstant = false;
3438 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3439 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3440
3441 switch (Pred) {
3443 // xor(cmpeq(x,y),-1)
3444 ExtraCost = CmpWithConstant ? 0 : 1;
3445 break;
3448 // xor(cmpgt(x,y),-1)
3449 ExtraCost = CmpWithConstant ? 0 : 1;
3450 break;
3453 // cmpgt(xor(x,signbit),xor(y,signbit))
3454 // xor(cmpeq(pmaxu(x,y),x),-1)
3455 ExtraCost = CmpWithConstant ? 1 : 2;
3456 break;
3459 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3460 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3461 // cmpeq(psubus(x,y),0)
3462 // cmpeq(pminu(x,y),x)
3463 ExtraCost = 1;
3464 } else {
3465 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3466 ExtraCost = CmpWithConstant ? 2 : 3;
3467 }
3468 break;
3471 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3472 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3473 if (CondTy && !ST->hasAVX())
3474 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3476 Op1Info, Op2Info) +
3477 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3479 Op1Info, Op2Info) +
3480 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3481
3482 break;
3485 // Assume worst case scenario and add the maximum extra cost.
3486 ExtraCost = 3;
3487 break;
3488 default:
3489 break;
3490 }
3491 }
3492 }
3493
3494 static const CostKindTblEntry SLMCostTbl[] = {
3495 // slm pcmpeq/pcmpgt throughput is 2
3496 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3497 // slm pblendvb/blendvpd/blendvps throughput is 4
3498 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3499 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3500 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3501 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3502 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3503 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3504 };
3505
3506 static const CostKindTblEntry AVX512BWCostTbl[] = {
3507 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3508 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3509 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3510 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3511
3512 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3513 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3514 };
3515
3516 static const CostKindTblEntry AVX512CostTbl[] = {
3517 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3518 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3519 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3520 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3521
3522 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3523 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3524 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3525 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3526 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3527 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3528 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3529
3530 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3531 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3532 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3533 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3534 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3535 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3536 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3537 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3538 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3539 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3540 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3541 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3542 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3543 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3544
3545 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3546 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3547 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3548 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3549 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3550 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3551 };
3552
3553 static const CostKindTblEntry AVX2CostTbl[] = {
3554 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3555 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3556 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3557 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3558 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3559 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3560
3561 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3562 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3563 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3564 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3565
3566 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3567 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3568 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3569 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3570 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3571 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3572 };
3573
3574 static const CostKindTblEntry XOPCostTbl[] = {
3575 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3576 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3577 };
3578
3579 static const CostKindTblEntry AVX1CostTbl[] = {
3580 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3581 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3582 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3583 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3584 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3585 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3586
3587 // AVX1 does not support 8-wide integer compare.
3588 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3589 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3590 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3591 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3592
3593 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3594 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3595 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3596 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3597 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3598 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3599 };
3600
3601 static const CostKindTblEntry SSE42CostTbl[] = {
3602 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3603 };
3604
3605 static const CostKindTblEntry SSE41CostTbl[] = {
3606 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3607 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3608
3609 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3610 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3611 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3612 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3613 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3614 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3615 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3616 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3617 };
3618
3619 static const CostKindTblEntry SSE2CostTbl[] = {
3620 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3621 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3622
3623 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3624 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3625 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3626 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3627
3628 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3629 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3630 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3631 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3632 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3633 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3634 };
3635
3636 static const CostKindTblEntry SSE1CostTbl[] = {
3637 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3638 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3639
3640 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3641 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3642 };
3643
3644 if (ST->useSLMArithCosts())
3645 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3646 if (auto KindCost = Entry->Cost[CostKind])
3647 return LT.first * (ExtraCost + *KindCost);
3648
3649 if (ST->hasBWI())
3650 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3651 if (auto KindCost = Entry->Cost[CostKind])
3652 return LT.first * (ExtraCost + *KindCost);
3653
3654 if (ST->hasAVX512())
3655 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3656 if (auto KindCost = Entry->Cost[CostKind])
3657 return LT.first * (ExtraCost + *KindCost);
3658
3659 if (ST->hasAVX2())
3660 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3661 if (auto KindCost = Entry->Cost[CostKind])
3662 return LT.first * (ExtraCost + *KindCost);
3663
3664 if (ST->hasXOP())
3665 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3666 if (auto KindCost = Entry->Cost[CostKind])
3667 return LT.first * (ExtraCost + *KindCost);
3668
3669 if (ST->hasAVX())
3670 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3671 if (auto KindCost = Entry->Cost[CostKind])
3672 return LT.first * (ExtraCost + *KindCost);
3673
3674 if (ST->hasSSE42())
3675 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3676 if (auto KindCost = Entry->Cost[CostKind])
3677 return LT.first * (ExtraCost + *KindCost);
3678
3679 if (ST->hasSSE41())
3680 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3681 if (auto KindCost = Entry->Cost[CostKind])
3682 return LT.first * (ExtraCost + *KindCost);
3683
3684 if (ST->hasSSE2())
3685 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3686 if (auto KindCost = Entry->Cost[CostKind])
3687 return LT.first * (ExtraCost + *KindCost);
3688
3689 if (ST->hasSSE1())
3690 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3691 if (auto KindCost = Entry->Cost[CostKind])
3692 return LT.first * (ExtraCost + *KindCost);
3693
3694 // Assume a 3cy latency for fp select ops.
3695 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3696 if (ValTy->getScalarType()->isFloatingPointTy())
3697 return 3;
3698
3699 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3700 Op1Info, Op2Info, I);
3701}
3702
3704
3708 // Costs should match the codegen from:
3709 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3710 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3711 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3712 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3713 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3714
3715 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3716 // specialized in these tables yet.
3717 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3718 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3719 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3720 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3721 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3722 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3723 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3724 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3725 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3726 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3727 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3728 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3729 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3730 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3731 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3732 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3733 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3734 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3735 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3736 };
3737 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3738 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3739 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3740 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3741 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3742 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3743 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3744 };
3745 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3746 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3747 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3748 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3749 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3750 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3751 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3752 };
3753 static const CostKindTblEntry AVX512CDCostTbl[] = {
3754 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3755 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3756 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3757 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3758 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3759 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3760 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3761 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3762 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3763 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3764 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3765 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3766
3767 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3768 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3769 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3770 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3771 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3772 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3773 };
3774 static const CostKindTblEntry AVX512BWCostTbl[] = {
3775 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3776 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3777 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3778 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3779 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3780 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3781 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3782 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3783 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3784 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3785 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3786 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3787 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3788 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3789 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3790 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3791 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3792 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3793 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3794 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3795 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3796 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3797 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3798 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3799 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3800 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3801 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3802 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3803 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3804 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3805 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3806 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3807 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3808 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3809 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3810 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3811 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3812 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3813 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3814 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3815 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3816 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3817 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3818 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3819 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3820 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3821 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3822 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3823 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3824 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3825 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3826 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3827 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3828 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3829 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3830 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3831 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3832 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3833 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3834 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3835 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3836 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3837 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3838 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3839 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3840 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3841 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3842 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3843 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3844 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3845 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3846 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3847 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3848 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3849 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3850 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3851 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3852 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3853 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3854 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3855 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3856 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3857 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3858 };
3859 static const CostKindTblEntry AVX512CostTbl[] = {
3860 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3861 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3862 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3863 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3864 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3865 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3866 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3867 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3868 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3869 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3870 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3871 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3872 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3873 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3874 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3875 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3876 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3877 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3878 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3879 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3880 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3881 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3882 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3883 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3884 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3885 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3886 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3887 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3888 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3889 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3890 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3891 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3892 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3893 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3894 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3895 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3896 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3897 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3898 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3899 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3900 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3901 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3902 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3903 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3904 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3905 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3906 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3907 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3908 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3909 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3910 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3911 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3912 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3913 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3914 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3915 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3916 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3917 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3918 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3919 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3920 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3921 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3922 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3923 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3924 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3925 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3926 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3927 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3928 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3929 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3930 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3931 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3932 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3933 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3934 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3935 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3936 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3937 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3938 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3939 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3940 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3941 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3942 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3943 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3944 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3945 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3946 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3947 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3948 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3949 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3950 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3951 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3952 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3953 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3954 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3955 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3956 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3957 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3958 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3959 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3960 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3961 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3962 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3963 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3964 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3965 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3966 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3967 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3968 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3969 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3970 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3971 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3972 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3973 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3974 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3975 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3976 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3977 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3978 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3979 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3980 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3981 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3982 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3983 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3984 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3985 };
// Intrinsic cost table for XOP: BITREVERSE (vector and scalar), variable
// rotates (ROTL/ROTR) and rotate-by-immediate (X86ISD::VROTLI).
// Each row is { opcode, value type, { four cost values } }.
// NOTE(review): the four values presumably correspond to the four
// TargetCostKind kinds mentioned in the file header; their order is not
// visible from this table - confirm against CostKindTblEntry.
// In every group below the 256-bit types are listed with higher costs than
// their 128-bit counterparts.
3986 static const CostKindTblEntry XOPCostTbl[] = {
3987 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3988 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3989 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3990 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3991 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3992 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3993 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3994 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3995 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3996 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3997 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3998 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3999 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
4000 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
4001 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
4002 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
4003 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
4004 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
4005 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
4006 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
4007 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
4008 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
4009 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
4010 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
4011 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
4012 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
4013 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
4014 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
4015 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
4016 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
4017 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
4018 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
4019 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
4020 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
4021 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
4022 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
4023 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
4024 };
// Intrinsic cost table for AVX2 (the file header names Haswell / Ryzen as the
// reference CPUs for this feature level).
// Covers 128-bit and 256-bit integer vector ops (ABS, BITREVERSE, BSWAP,
// CTLZ/CTPOP/CTTZ, saturating add/sub, min/max, mul-with-overflow) and
// scalar/vector FMAXNUM and FSQRT.
// Each row is { opcode, value type, { four cost values } }.
// NOTE(review): the four values presumably correspond to the four
// TargetCostKind kinds from the file header; order not visible here - confirm.
4025 static const CostKindTblEntry AVX2CostTbl[] = {
4026 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4027 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4028 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
4029 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
4030 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
4031 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
4032 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
4033 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
4034 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
4035 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
4036 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
4037 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
4038 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
4039 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
4040 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
4041 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
4042 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
4043 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
4044 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
4045 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
4046 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
4047 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
4048 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
4049 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
4050 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
4051 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
4052 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
4053 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
4054 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
4055 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
4056 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
4057 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
4058 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
4059 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
4060 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
4061 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
4062 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
4063 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
4064 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
4065 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
4066 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
4067 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
4068 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
4069 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
4070 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
4071 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
4072 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4073 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4074 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4075 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4076 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4077 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4078 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4079 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4080 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4081 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4082 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4083 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4084 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4085 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4086 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4087 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4088 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4089 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4090 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4091 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4092 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4093 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4094 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4095 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4096 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4097 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4098 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4099 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4100 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4101 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4102 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4103 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4104 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4105 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4106 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4107 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4108 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4109 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4110 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4111 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4112 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4113 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4114 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4115 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4116 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4117 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4118 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4119 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4120 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4121 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4122 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4123 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4124 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4125 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4126 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4127 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4128 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4129 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4130 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4131 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4132 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4133 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4134 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4135 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4136 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4137 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4138 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4139 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4140 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4141 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4142 };
// Intrinsic cost table for AVX1 (the file header names Sandy Bridge / Jaguar /
// Bulldozer as the reference CPUs for this feature level).
// Many 256-bit integer rows are annotated "2 x 128-bit Op + extract/insert":
// the operation is costed as two 128-bit halves plus the lane shuffling.
// Each row is { opcode, value type, { four cost values } }.
// NOTE(review): the four values presumably correspond to the four
// TargetCostKind kinds from the file header; order not visible here - confirm.
// Keep exactly one row per (opcode, type) pair: a repeated key is dead data if
// lookup returns the first matching row.
// NOTE(review): the lookup helper is outside this view - presumably
// first-match; confirm.
4143 static const CostKindTblEntry AVX1CostTbl[] = {
4144 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4145 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4146 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4147 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4148 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4149 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4150 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4151 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4152 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4153 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4154 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4155 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4156 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4157 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4158 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4159 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4160 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4161 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4162 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4163 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4164 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4165 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4166 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4167 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4168 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4169 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4170 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4171 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4172 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4173 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4174 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4175 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4176 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4177 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4178 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4179 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4180 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4181 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4182 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4183 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4184 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4185 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4186 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4187 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4188 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4189 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4190 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4191 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4192 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4193 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4194 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4195 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4196 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4197 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4198 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4199 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4200 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4201 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4202 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4203 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4204 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4205 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4206 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4207 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4208 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4209 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4210 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4211 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4212 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4213 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4214 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4215 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4216 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4217 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4218 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4219 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4220 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4221 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4222 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4223 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4224 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4225 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4226 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4227 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4228 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4229 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4230 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4231 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4232 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4233 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4234 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4235 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4236 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4237 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4238 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4239 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4241 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4242 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4243 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4244 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4245 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4246 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4247 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4248 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4249 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4250 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4251 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4252 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4253 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4254 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4255 };
// Intrinsic cost table for GFNI. Every row is annotated gf2p8affineqb: the
// table covers BITREVERSE for scalar and 128/256/512-bit integer vectors, and
// rotate-by-immediate (X86ISD::VROTLI) for byte vectors.
// Each row is { opcode, value type, { four cost values } }.
// NOTE(review): the four values presumably correspond to the four
// TargetCostKind kinds from the file header; order not visible here - confirm.
4256 static const CostKindTblEntry GFNICostTbl[] = {
4257 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4258 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4259 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4260 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4261 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4262 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4263 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4264 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4265 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4266 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4267 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4268 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4269 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4270 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4271 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4272 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4273 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4274 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4275 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4276 };
// FSQRT costs for scalar and 128-bit f32/f64 types.
// NOTE(review): "GLM" presumably denotes the Goldmont CPU family - the
// selection logic is outside this view; confirm.
// Each row is { opcode, value type, { four cost values } }.
4277 static const CostKindTblEntry GLMCostTbl[] = {
4278 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4279 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4280 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4281 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4282 };
// BSWAP costs for 128-bit integer vectors and FSQRT costs for scalar and
// 128-bit f32/f64 types.
// NOTE(review): "SLM" presumably denotes the Silvermont CPU family - the
// selection logic is outside this view; confirm.
// Each row is { opcode, value type, { four cost values } }.
4283 static const CostKindTblEntry SLMCostTbl[] = {
4284 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4285 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4286 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4287 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4288 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4289 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4290 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4291 };
// Intrinsic cost table for SSE4.2: FMAXNUM (costed as max + unordered compare
// + blend, per the row comments) and single-precision FSQRT (Nehalem numbers,
// per the row comments).
// Each row is { opcode, value type, { four cost values } }.
4292 static const CostKindTblEntry SSE42CostTbl[] = {
4293 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4294 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4295 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4296 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4297 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4298 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4299 };
// Intrinsic cost table for SSE4.1 (the file header names Penryn as the
// reference CPU for this feature level): 128-bit integer ABS, saturating
// add/sub, min/max and mul-with-overflow.
// Each row is { opcode, value type, { four cost values } }.
// NOTE(review): the four values presumably correspond to the four
// TargetCostKind kinds from the file header; order not visible here - confirm.
4300 static const CostKindTblEntry SSE41CostTbl[] = {
4301 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4302 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4303 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4304 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4305 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4306 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4307 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4308 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4309 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4310 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4311 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4312 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4313 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4314 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4315 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4316 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4317 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4318 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4319 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4320 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4321 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4322 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4323 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4324 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4325 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4326 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4327 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4328 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4329 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4330 };
// Intrinsic cost table for SSSE3: 128-bit integer ABS, BITREVERSE, BSWAP and
// the bit-counting ops (CTLZ, CTPOP, CTTZ).
// Each row is { opcode, value type, { four cost values } }.
// NOTE(review): the four values presumably correspond to the four
// TargetCostKind kinds from the file header; order not visible here - confirm.
4331 static const CostKindTblEntry SSSE3CostTbl[] = {
4332 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4333 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4334 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4335 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4336 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4337 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4338 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4339 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4340 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4341 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4342 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4343 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4344 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4345 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4346 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4347 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4348 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4349 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4350 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4351 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4352 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4353 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4354 };
// Intrinsic cost table for SSE2: 128-bit integer ops (ABS, BITREVERSE, BSWAP,
// CTLZ/CTPOP/CTTZ, saturating add/sub, min/max, mul-with-overflow) and
// double-precision FMAXNUM / FSQRT (FSQRT rows use Nehalem numbers, per the
// row comments).
// Each row is { opcode, value type, { four cost values } }.
// NOTE(review): the four values presumably correspond to the four
// TargetCostKind kinds from the file header; order not visible here - confirm.
4355 static const CostKindTblEntry SSE2CostTbl[] = {
4356 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4357 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4358 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4359 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4360 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4361 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4362 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4363 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4364 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4365 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4366 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4367 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4368 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4369 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4370 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4371 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4372 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4373 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4374 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4375 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4376 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4377 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4378 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4379 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4380 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4381 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4382 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4383 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4384 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4385 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4386 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4387 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4388 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4389 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4390 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4391 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4392 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4393 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4394 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4395 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4396 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4397 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4398 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4399 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4400 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4401 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4402 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4403 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4404 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4405 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4406 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4407 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4408 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4409 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4410 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4411 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4412 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4413 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4414 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4415 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4416 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4417 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4418 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4419 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4420 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4421 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4422 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4423 };
// Intrinsic cost table for SSE1: single-precision FMAXNUM and FSQRT only
// (FSQRT rows use Pentium III numbers, per the row comments).
// Each row is { opcode, value type, { four cost values } }.
4424 static const CostKindTblEntry SSE1CostTbl[] = {
4425 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4426 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4427 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4428 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4429 };
// 64-bit scalar CTTZ cost when BMI is available.
// NOTE(review): presumably models a single TZCNT - confirm.
4430 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4431 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4432 };
// Scalar CTTZ costs for i32/i16/i8 when BMI is available.
// NOTE(review): presumably models TZCNT; the i16/i8 rows carry a higher first
// cost value than i32 - reason not visible here, confirm.
4433 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4434 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4435 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4436 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4437 };
// 64-bit scalar CTLZ cost when LZCNT is available.
// NOTE(review): presumably models a single LZCNT instruction - confirm.
4438 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4439 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4440 };
// Scalar CTLZ costs for i32/i16/i8 when LZCNT is available.
// NOTE(review): presumably models LZCNT; the i16/i8 rows carry a higher first
// cost value than i32 - reason not visible here, confirm.
4441 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4442 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4443 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4444 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4445 };
// 64-bit scalar CTPOP cost when POPCNT is available (a single popcnt, per the
// row comment).
4446 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4447 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4448 };
// Scalar CTPOP costs for i32/i16/i8 when POPCNT is available. Per the row
// comments, i16/i8 are costed as popcnt of a zero-extended value, hence the
// larger last two cost values.
4449 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4450 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4451 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4452 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4453 };
// Baseline scalar intrinsic costs for 64-bit targets: mostly i64 operations,
// plus CTLZ/CTTZ rows for the narrower integer types (i32/i16/i8).
// Each row is { opcode, value type, { four cost values } }; the row comments
// name the instruction sequence being modelled where one is given.
// NOTE(review): the four values presumably correspond to the four
// TargetCostKind kinds from the file header; order not visible here - confirm.
4454 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4455 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4456 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4457 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4458 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4459 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4460 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4461 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4462 { ISD::CTLZ_ZERO_POISON,MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4463 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4464 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4465 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4466 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4467 { ISD::CTTZ_ZERO_POISON,MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4468 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4469 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4470 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4471 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4472 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4473 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4474 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4475 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4476 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4477 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4478 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4479 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4480 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4481 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4482 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4483 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4484 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4485 };
4486 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4487 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4488 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4489 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4490 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4491 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4492 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4493 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4494 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4495 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4496 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4497 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4498 { ISD::CTLZ_ZERO_POISON,MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4499 { ISD::CTLZ_ZERO_POISON,MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4500 { ISD::CTLZ_ZERO_POISON,MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4501 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4502 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4503 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4504 { ISD::CTTZ_ZERO_POISON,MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4505 { ISD::CTTZ_ZERO_POISON,MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4506 { ISD::CTTZ_ZERO_POISON,MVT::i8, { 2, 2, 1, 2 } }, // BSF
4507 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4508 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4509 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4510 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4511 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4512 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4513 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4514 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4515 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4516 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4517 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4518 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4519 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4520 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4521 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4522 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4523 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4524 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4525 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4526 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4527 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4528 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4529 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4530 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4531 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4532 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4533 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4534 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4535 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4536 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4537 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4538 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4539 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4540 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4541 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4542 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4543 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4544 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4545 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4546 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4547 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4548 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4549 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4550 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4551 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4552 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4553 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4554 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4555 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4556 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4557 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4558 };
4559
4560 Type *RetTy = ICA.getReturnType();
4561 Type *OpTy = RetTy;
4562 Intrinsic::ID IID = ICA.getID();
4563 unsigned ISD = ISD::DELETED_NODE;
4564 switch (IID) {
4565 default:
4566 break;
4567 case Intrinsic::abs:
4568 ISD = ISD::ABS;
4569 break;
4570 case Intrinsic::bitreverse:
4572 break;
4573 case Intrinsic::bswap:
4574 ISD = ISD::BSWAP;
4575 break;
4576 case Intrinsic::ctlz:
4577 ISD = ISD::CTLZ;
4578 break;
4579 case Intrinsic::ctpop:
4580 ISD = ISD::CTPOP;
4581 break;
4582 case Intrinsic::cttz:
4583 ISD = ISD::CTTZ;
4584 break;
4585 case Intrinsic::fshl:
4586 ISD = ISD::FSHL;
4587 if (!ICA.isTypeBasedOnly()) {
4588 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4589 if (Args[0] == Args[1]) {
4590 ISD = ISD::ROTL;
4591 // Handle uniform constant rotation amounts.
4592 // TODO: Handle funnel-shift cases.
4593 const APInt *Amt;
4594 if (Args[2] &&
4596 ISD = X86ISD::VROTLI;
4597 }
4598 }
4599 break;
4600 case Intrinsic::fshr:
4601 // FSHR has same costs so don't duplicate.
4602 ISD = ISD::FSHL;
4603 if (!ICA.isTypeBasedOnly()) {
4604 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4605 if (Args[0] == Args[1]) {
4606 ISD = ISD::ROTR;
4607 // Handle uniform constant rotation amount.
4608 // TODO: Handle funnel-shift cases.
4609 const APInt *Amt;
4610 if (Args[2] &&
4612 ISD = X86ISD::VROTLI;
4613 }
4614 }
4615 break;
4616 case Intrinsic::lrint:
4617 case Intrinsic::llrint: {
4618 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4619 // have the same costs as the CVTTP2SI (fptosi) instructions
4620 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4621 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4623 }
4624 case Intrinsic::maxnum:
4625 case Intrinsic::minnum:
4626 // FMINNUM has same costs so don't duplicate.
4627 ISD = ISD::FMAXNUM;
4628 break;
4629 case Intrinsic::sadd_sat:
4630 ISD = ISD::SADDSAT;
4631 break;
4632 case Intrinsic::smax:
4633 ISD = ISD::SMAX;
4634 break;
4635 case Intrinsic::smin:
4636 ISD = ISD::SMIN;
4637 break;
4638 case Intrinsic::ssub_sat:
4639 ISD = ISD::SSUBSAT;
4640 break;
4641 case Intrinsic::uadd_sat:
4642 ISD = ISD::UADDSAT;
4643 break;
4644 case Intrinsic::umax:
4645 ISD = ISD::UMAX;
4646 break;
4647 case Intrinsic::umin:
4648 ISD = ISD::UMIN;
4649 break;
4650 case Intrinsic::usub_sat:
4651 ISD = ISD::USUBSAT;
4652 break;
4653 case Intrinsic::sqrt:
4654 ISD = ISD::FSQRT;
4655 break;
4656 case Intrinsic::sadd_with_overflow:
4657 case Intrinsic::ssub_with_overflow:
4658 // SSUBO has same costs so don't duplicate.
4659 ISD = ISD::SADDO;
4660 OpTy = RetTy->getContainedType(0);
4661 break;
4662 case Intrinsic::uadd_with_overflow:
4663 case Intrinsic::usub_with_overflow:
4664 // USUBO has same costs so don't duplicate.
4665 ISD = ISD::UADDO;
4666 OpTy = RetTy->getContainedType(0);
4667 break;
4668 case Intrinsic::smul_with_overflow:
4669 ISD = ISD::SMULO;
4670 OpTy = RetTy->getContainedType(0);
4671 break;
4672 case Intrinsic::umul_with_overflow:
4673 ISD = ISD::UMULO;
4674 OpTy = RetTy->getContainedType(0);
4675 break;
4676 }
4677
4678 if (ISD != ISD::DELETED_NODE) {
4679 auto adjustTableCost = [&](int ISD, unsigned Cost,
4680 std::pair<InstructionCost, MVT> LT,
4682 InstructionCost LegalizationCost = LT.first;
4683 MVT MTy = LT.second;
4684
4685 // If there are no NANs to deal with, then these are reduced to a
4686 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4687 // assume is used in the non-fast case.
4688 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4689 if (FMF.noNaNs())
4690 return LegalizationCost * 1;
4691 }
4692
4693 // For cases where some ops can be folded into a load/store, assume free.
4694 if (MTy.isScalarInteger()) {
4695 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4696 if (const Instruction *II = ICA.getInst()) {
4697 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4698 return TTI::TCC_Free;
4699 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4700 if (LI->hasOneUse())
4701 return TTI::TCC_Free;
4702 }
4703 }
4704 }
4705 }
4706
4707 return LegalizationCost * (int)Cost;
4708 };
4709
4710 // Legalize the type.
4711 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4712 MVT MTy = LT.second;
4713
4714 // Without BMI/LZCNT see if we're only looking for a *_ZERO_POISON cost.
4715 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4716 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4717 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4718 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4719 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4720 if (Cst->isAllOnesValue())
4721 ISD =
4723 }
4724
4725 // FSQRT is a single instruction.
4727 return LT.first;
4728
4729 if (ST->useGLMDivSqrtCosts())
4730 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4731 if (auto KindCost = Entry->Cost[CostKind])
4732 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4733
4734 if (ST->useSLMArithCosts())
4735 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4736 if (auto KindCost = Entry->Cost[CostKind])
4737 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4738
4739 if (ST->hasVBMI2())
4740 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4741 if (auto KindCost = Entry->Cost[CostKind])
4742 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4743
4744 if (ST->hasBITALG())
4745 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4746 if (auto KindCost = Entry->Cost[CostKind])
4747 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4748
4749 if (ST->hasVPOPCNTDQ())
4750 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4751 if (auto KindCost = Entry->Cost[CostKind])
4752 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4753
4754 if (ST->hasGFNI())
4755 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4756 if (auto KindCost = Entry->Cost[CostKind])
4757 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4758
4759 if (ST->hasCDI())
4760 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4761 if (auto KindCost = Entry->Cost[CostKind])
4762 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4763
4764 if (ST->hasBWI())
4765 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4766 if (auto KindCost = Entry->Cost[CostKind])
4767 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4768
4769 if (ST->hasAVX512())
4770 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4771 if (auto KindCost = Entry->Cost[CostKind])
4772 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4773
4774 if (ST->hasXOP())
4775 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4776 if (auto KindCost = Entry->Cost[CostKind])
4777 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4778
4779 if (ST->hasAVX2())
4780 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4781 if (auto KindCost = Entry->Cost[CostKind])
4782 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4783
4784 if (ST->hasAVX())
4785 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4786 if (auto KindCost = Entry->Cost[CostKind])
4787 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4788
4789 if (ST->hasSSE42())
4790 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4791 if (auto KindCost = Entry->Cost[CostKind])
4792 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4793
4794 if (ST->hasSSE41())
4795 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4796 if (auto KindCost = Entry->Cost[CostKind])
4797 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4798
4799 if (ST->hasSSSE3())
4800 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4801 if (auto KindCost = Entry->Cost[CostKind])
4802 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4803
4804 if (ST->hasSSE2())
4805 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4806 if (auto KindCost = Entry->Cost[CostKind])
4807 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4808
4809 if (ST->hasSSE1())
4810 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4811 if (auto KindCost = Entry->Cost[CostKind])
4812 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4813
4814 if (ST->hasBMI()) {
4815 if (ST->is64Bit())
4816 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4817 if (auto KindCost = Entry->Cost[CostKind])
4818 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4819
4820 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4821 if (auto KindCost = Entry->Cost[CostKind])
4822 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4823 }
4824
4825 if (ST->hasLZCNT()) {
4826 if (ST->is64Bit())
4827 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4828 if (auto KindCost = Entry->Cost[CostKind])
4829 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4830
4831 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4832 if (auto KindCost = Entry->Cost[CostKind])
4833 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4834 }
4835
4836 if (ST->hasPOPCNT()) {
4837 if (ST->is64Bit())
4838 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4839 if (auto KindCost = Entry->Cost[CostKind])
4840 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4841
4842 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4843 if (auto KindCost = Entry->Cost[CostKind])
4844 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4845 }
4846
4847 if (ST->is64Bit())
4848 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4849 if (auto KindCost = Entry->Cost[CostKind])
4850 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4851
4852 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4853 if (auto KindCost = Entry->Cost[CostKind])
4854 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4855
4856 // Without arg data, we need to compute the expanded costs of custom lowered
4857 // intrinsics to prevent use of the (very low) default costs.
4858 if (ICA.isTypeBasedOnly() &&
4859 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4860 Type *CondTy = RetTy->getWithNewBitWidth(1);
4862 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4863 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4864 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4865 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4866 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4867 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4869 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4871 return Cost;
4872 }
4873 }
4874
4876}
4877
// Estimates the cost of inserting or extracting a single vector element on
// X86.
//
// NOTE(review): the line carrying the return type and function name (listing
// line 4878) is absent from this listing. The fallback call to
// BaseT::getVectorInstrCost at the end suggests this is the X86TTIImpl
// override of getVectorInstrCost - confirm against the original file.
//
// Parameters (as used below):
//   Opcode   - Instruction::ExtractElement / Instruction::InsertElement get
//              the custom handling; any other opcode falls through to BaseT.
//   Val      - the vector type being accessed (asserted to be a vector).
//   CostKind - forwarded to the memory-op, shuffle and BaseT cost queries.
//   Index    - element index; -1U is treated as a non-immediate index.
//   Op0, Op1 - optional operands, only inspected by the Index == 0
//              heuristics; both are null-checked before use.
//   VIC      - only forwarded to the BaseT fallback.
4879 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4880 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
// Per-scalar-type extract costs used when ST->useSLMArithCosts() is true.
4881 static const CostTblEntry SLMCostTbl[] = {
4882 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4883 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4884 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4885 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4886 };
4887
4888 assert(Val->isVectorTy() && "This must be a vector type");
4889 auto *VT = cast<VectorType>(Val);
4890 if (VT->isScalableTy())
// NOTE(review): the statement guarded by the scalable-type check (listing
// line 4891) is missing from this listing.
4892
4893 Type *ScalarType = Val->getScalarType();
// Accumulates the extra cost of moving 128-bit subvectors of wider legal
// vectors; it is added to every result returned after it has been updated.
4894 InstructionCost RegisterFileMoveCost = 0;
4895
4896 // Non-immediate extraction/insertion can be handled as a sequence of
4897 // aliased loads+stores via the stack.
4898 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4899 Opcode == Instruction::InsertElement)) {
4900 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4901 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4902
4903 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4904 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4905 Align VecAlign = DL.getPrefTypeAlign(Val);
4906 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4907
4908 // Extract - store vector to stack, load scalar.
4909 if (Opcode == Instruction::ExtractElement) {
4910 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4911 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4912 CostKind);
4913 }
4914 // Insert - store vector to stack, store scalar, load vector.
4915 if (Opcode == Instruction::InsertElement) {
4916 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4917 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4918 CostKind) +
4919 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4920 }
4921 }
4922
// Known (immediate) index: model the access on the legalized type.
4923 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4924 Opcode == Instruction::InsertElement)) {
4925 // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
4926 if (Opcode == Instruction::ExtractElement &&
4927 ScalarType->getScalarSizeInBits() == 1 &&
4928 cast<FixedVectorType>(Val)->getNumElements() > 1)
4929 return 1;
4930
4931 // Legalize the type.
4932 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4933
4934 // This type is legalized to a scalar type.
4935 if (!LT.second.isVector())
4936 return TTI::TCC_Free;
4937
4938 // The type may be split. Normalize the index to the new type.
4939 unsigned SizeInBits = LT.second.getSizeInBits();
4940 unsigned NumElts = LT.second.getVectorNumElements();
4941 unsigned SubNumElts = NumElts;
4942 Index = Index % NumElts;
4943
4944 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4945 // For inserts, we also need to insert the subvector back.
4946 if (SizeInBits > 128) {
4947 assert((SizeInBits % 128) == 0 && "Illegal vector");
4948 unsigned NumSubVecs = SizeInBits / 128;
4949 SubNumElts = NumElts / NumSubVecs;
// Only elements outside the lowest 128-bit subvector pay the subvector
// move: 1 for an extract, 2 for an insert (extract + re-insert).
4950 if (SubNumElts <= Index) {
4951 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4952 Index %= SubNumElts;
4953 }
4954 }
4955
4956 MVT MScalarTy = LT.second.getScalarType();
// Captures by reference, so it sees the normalized Index computed above.
4957 auto IsCheapPInsrPExtrInsertPS = [&]() {
4958 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4959 // Inserting f32 into index0 is just movss.
4960 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4961 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4962 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4963 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4964 Opcode == Instruction::InsertElement) ||
4965 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4966 Opcode == Instruction::InsertElement);
4967 };
4968
4969 if (Index == 0) {
4970 // Floating point scalars are already located in index #0.
4971 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4972 // true for all.
4973 if (ScalarType->isFloatingPointTy() &&
4974 (Opcode != Instruction::InsertElement || !Op0 ||
4975 isa<UndefValue>(Op0)))
4976 return RegisterFileMoveCost;
4977
// NOTE(review): listing lines 4979 and 4981 are missing below, i.e. the rest
// of this condition and the condition guarding the "gather" early return.
4978 if (Opcode == Instruction::InsertElement &&
4980 // Consider the gather cost to be cheap.
4982 return RegisterFileMoveCost;
4983 if (!IsCheapPInsrPExtrInsertPS()) {
4984 // mov constant-to-GPR + movd/movq GPR -> XMM.
4985 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4986 return 2 + RegisterFileMoveCost;
4987 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4988 return 1 + RegisterFileMoveCost;
4989 }
4990 }
4991
4992 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4993 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4994 return 1 + RegisterFileMoveCost;
4995 }
4996
// The SLM table is consulted before the generic "cheap" shortcut, so on
// targets with useSLMArithCosts() a matching table entry takes precedence.
4997 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4998 assert(ISD && "Unexpected vector opcode");
4999 if (ST->useSLMArithCosts())
5000 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
5001 return Entry->Cost + RegisterFileMoveCost;
5002
5003 // Consider cheap cases.
5004 if (IsCheapPInsrPExtrInsertPS())
5005 return 1 + RegisterFileMoveCost;
5006
5007 // For extractions we just need to shuffle the element to index 0, which
5008 // should be very cheap (assume cost = 1). For insertions we need to shuffle
5009 // the elements to its destination. In both cases we must handle the
5010 // subvector move(s).
5011 // If the vector type is already less than 128-bits then don't reduce it.
5012 // TODO: Under what circumstances should we shuffle using the full width?
5013 InstructionCost ShuffleCost = 1;
5014 if (Opcode == Instruction::InsertElement) {
5015 auto *SubTy = cast<VectorType>(Val);
5016 EVT VT = TLI->getValueType(DL, Val);
5017 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
5018 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
5019 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
5020 CostKind, 0, SubTy);
5021 }
// Integer elements are charged one extra unit on top of the shuffle;
// floating-point elements are not.
5022 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
5023 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
5024 }
5025
// Any other opcode: defer to the generic implementation.
// RegisterFileMoveCost is only modified inside the insert/extract branch
// above, which always returns, so it is still its initial 0 when control
// reaches this point.
5026 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
5027 VIC) +
5028 RegisterFileMoveCost;
5029}
5030
// Estimates the cost of inserting and/or extracting the demanded elements of
// a fixed vector type (scalarization overhead) on X86.
//
// NOTE(review): the line carrying the return type and function name (listing
// line 5031) is absent from this listing. The BaseT::getScalarizationOverhead
// calls below suggest this is the X86TTIImpl override of
// getScalarizationOverhead - confirm against the original file.
//
// Parameters (as used below):
//   Ty           - vector type; cast to FixedVectorType in the assert.
//   DemandedElts - one bit per element of Ty; set bits are the elements that
//                  are inserted/extracted.
//   Insert       - include the cost of building the vector from scalars.
//   Extract      - include the cost of extracting the scalars.
//   CostKind     - forwarded to the nested cost queries.
//   ForPoisonSrc - when false, insertions are costed one element at a time
//                  via getVectorInstrCost instead of as a build-vector.
//   VL           - optional scalar values, indexed by element when non-empty.
//   VIC          - not referenced by any line visible in this listing.
5032 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
5033 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
5034 TTI::VectorInstrContext VIC) const {
5035 assert(DemandedElts.getBitWidth() ==
5036 cast<FixedVectorType>(Ty)->getNumElements() &&
5037 "Vector size mismatch");
5038
5039 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5040 MVT MScalarTy = LT.second.getScalarType();
5041 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
// NOTE(review): listing line 5042 is missing here; `Cost`, which is
// accumulated into below, is presumably declared on it.
5043
5044 constexpr unsigned LaneBitWidth = 128;
5045 assert((LegalVectorBitWidth < LaneBitWidth ||
5046 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
5047 "Illegal vector");
5048
5049 const int NumLegalVectors = LT.first.getValue();
5050 assert(NumLegalVectors >= 0 && "Negative cost!");
5051
5052 // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
5053 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
5054 // a special heuristic regarding poison input which is passed here in
5055 // ForPoisonSrc.
5056 if (Insert && !ForPoisonSrc) {
5057 // This is nearly identical to BaseT::getScalarizationOverhead(), except
5058 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
5059 // Constant::getNullValue()), which makes the X86TTIImpl
5060 // getVectorInstrCost() return 0 instead of 1.
5061 for (unsigned I : seq(DemandedElts.getBitWidth())) {
5062 if (!DemandedElts[I])
5063 continue;
// NOTE(review): listing lines 5065 and 5067 (two of the call's arguments)
// are missing from this listing.
5064 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
5066 VL.empty() ? nullptr : VL[I],
5068 }
// Note: this path returns before the Extract handling further down.
5069 return Cost;
5070 }
5071
5072 if (Insert) {
5073 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
5074 (MScalarTy.isInteger() && ST->hasSSE41()) ||
5075 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
5076 // For types we can insert directly, insertion into 128-bit sub vectors is
5077 // cheap, followed by a cheap chain of concatenations.
5078 if (LegalVectorBitWidth <= LaneBitWidth) {
5079 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5080 /*Extract*/ false, CostKind);
5081 } else {
5082 // In each 128-lane, if at least one index is demanded but not all
5083 // indices are demanded and this 128-lane is not the first 128-lane of
5084 // the legalized-vector, then this 128-lane needs an extracti128; If in
5085 // each 128-lane, there is at least one demanded index, this 128-lane
5086 // needs an inserti128.
5087
5088 // The following cases will help you build a better understanding:
5089 // Assume we insert several elements into a v8i32 vector in avx2,
5090 // Case#1: inserting into 1th index needs vpinsrd + inserti128.
5091 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
5092 // inserti128.
5093 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
5094 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5095 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5096 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5097 unsigned NumLegalElts =
5098 LT.second.getVectorNumElements() * NumLegalVectors;
5099 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5100 "Vector has been legalized to smaller element count");
5101 assert((NumLegalElts % NumLanesTotal) == 0 &&
5102 "Unexpected elts per lane");
5103 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5104
// Zero-extend the mask so that padding elements added by legalization are
// treated as not demanded.
5105 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5106 auto *LaneTy =
5107 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5108
// First pass: per 128-bit lane, cost the element insertions (plus whatever
// the missing line 5117 adds for partially demanded lanes).
5109 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5110 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5111 NumEltsPerLane, NumEltsPerLane * I);
5112 if (LaneEltMask.isZero())
5113 continue;
5114 // FIXME: we don't need to extract if all non-demanded elements
5115 // are legalization-inserted padding.
// NOTE(review): listing line 5117 (the head of the statement guarded by
// this check) is missing from this listing.
5116 if (!LaneEltMask.isAllOnes())
5118 CostKind, I * NumEltsPerLane, LaneTy);
5119 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5120 /*Extract*/ false, CostKind);
5121 }
5122
// Second pass: per affected lane, cost putting the lane back into its
// legal vector.
5123 APInt AffectedLanes =
5124 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5125 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5126 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5127 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5128 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5129 unsigned I = NumLegalLanes * LegalVec + Lane;
5130 // No need to insert unaffected lane; or lane 0 of each legal vector
5131 // iff ALL lanes of that vector were affected and will be inserted.
5132 if (!AffectedLanes[I] ||
5133 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5134 continue;
// NOTE(review): listing line 5135 (the head of this statement) is missing
// from this listing.
5136 CostKind, I * NumEltsPerLane, LaneTy);
5137 }
5138 }
5139 }
5140 } else if (LT.second.isVector()) {
5141 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5142 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5143 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5144 // considered cheap.
5145 if (Ty->isIntOrIntVectorTy())
5146 Cost += DemandedElts.popcount();
5147
5148 // Get the smaller of the legalized or original pow2-extended number of
5149 // vector elements, which represents the number of unpacks we'll end up
5150 // performing.
5151 unsigned NumElts = LT.second.getVectorNumElements();
// NOTE(review): listing line 5153 (the initializer of Pow2Elts) is missing
// from this listing.
5152 unsigned Pow2Elts =
5154 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5155 }
5156 }
5157
5158 if (Extract) {
5159 // vXi1 can be efficiently extracted with MOVMSK.
5160 // TODO: AVX512 predicate mask handling.
5161 // NOTE: This doesn't work well for roundtrip scalarization.
5162 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5163 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5164 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
// Ceiling division: MaxElts mask bits are covered per unit of cost.
5165 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5166 return MOVMSKCost;
5167 }
5168
5169 if (LT.second.isVector()) {
5170 unsigned NumLegalElts =
5171 LT.second.getVectorNumElements() * NumLegalVectors;
5172 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5173 "Vector has been legalized to smaller element count");
5174
5175 // If we're extracting elements from a 128-bit subvector lane,
5176 // we only need to extract each lane once, not for every element.
5177 if (LegalVectorBitWidth > LaneBitWidth) {
5178 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5179 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5180 assert((NumLegalElts % NumLanesTotal) == 0 &&
5181 "Unexpected elts per lane");
5182 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5183
5184 // Add cost for each demanded 128-bit subvector extraction.
5185 // Luckily this is a lot easier than for insertion.
5186 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5187 auto *LaneTy =
5188 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5189
5190 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5191 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5192 NumEltsPerLane, I * NumEltsPerLane);
5193 if (LaneEltMask.isZero())
5194 continue;
// NOTE(review): listing lines 5195 and 5197 (the heads of the two
// statements below) are missing from this listing.
5196 I * NumEltsPerLane, LaneTy);
5198 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5199 }
5200
5201 return Cost;
5202 }
5203 }
5204
5205 // Fallback to default extraction.
5206 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5207 Extract, CostKind);
5208 }
5209
5210 return Cost;
5211}
5212
// Estimates the cost of a replication shuffle: VF source elements of type
// EltTy expanded to VF * ReplicationFactor destination elements.
//
// Only AVX512 targets are modelled here; everything else (and any element
// width other than 1/8/16/32/64 bits) is delegated to
// BaseT::getReplicationShuffleCost. Element widths without a native shuffle
// on the current subtarget are promoted to a wider integer type, and the cost
// is then the extend + truncate casts plus a recursive query on the promoted
// type. Otherwise the cost is one single-source permute per demanded legal
// destination vector.
//
// NOTE(review): listing line 5213 (presumably the return type) and line 5216
// (the end of the parameter list, which must declare the CostKind used below)
// are missing from this listing.
5214X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5215 int VF, const APInt &DemandedDstElts,
5217 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5218 // We don't differentiate element types here, only element bit width.
5219 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5220
// Shared fallback to the generic implementation; uses the integer-normalized
// EltTy assigned above.
5221 auto bailout = [&]() {
5222 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5223 DemandedDstElts, CostKind);
5224 };
5225
5226 // For now, only deal with AVX512 cases.
5227 if (!ST->hasAVX512())
5228 return bailout();
5229
5230 // Do we have a native shuffle for this element type, or should we promote?
5231 unsigned PromEltTyBits = EltTyBits;
5232 switch (EltTyBits) {
5233 case 32:
5234 case 64:
5235 break; // AVX512F.
5236 case 16:
5237 if (!ST->hasBWI())
5238 PromEltTyBits = 32; // promote to i32, AVX512F.
5239 break; // AVX512BW
5240 case 8:
5241 if (!ST->hasVBMI())
5242 PromEltTyBits = 32; // promote to i32, AVX512F.
5243 break; // AVX512VBMI
5244 case 1:
5245 // There is no support for shuffling i1 elements. We *must* promote.
5246 if (ST->hasBWI()) {
5247 if (ST->hasVBMI())
5248 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5249 else
5250 PromEltTyBits = 16; // promote to i16, AVX512BW.
5251 break;
5252 }
5253 PromEltTyBits = 32; // promote to i32, AVX512F.
5254 break;
5255 default:
5256 return bailout();
5257 }
5258 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5259
5260 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5261 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5262
5263 int NumDstElements = VF * ReplicationFactor;
5264 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5265 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5266
5267 // Legalize the types.
5268 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5269 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5270 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5271 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5272 // They should have legalized into vector types.
5273 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5274 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5275 return bailout();
5276
5277 if (PromEltTyBits != EltTyBits) {
5278 // If we have to perform the shuffle with wider elt type than our data type,
5279 // then we will first need to anyext (we don't care about the new bits)
5280 // the source elements, and then truncate Dst elements.
// Note: the extension is costed as Instruction::SExt below, although only
// an any-extend is semantically required.
// NOTE(review): listing lines 5284 and 5288 (the trailing arguments of the
// two getCastInstrCost calls) are missing from this listing.
5281 InstructionCost PromotionCost;
5282 PromotionCost += getCastInstrCost(
5283 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5285 PromotionCost +=
5286 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5287 /*Src=*/PromDstVecTy,
// Recurse with the promoted element type; on that call the promoted width
// equals the element width, so this branch is not taken again.
5289 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5290 ReplicationFactor, VF,
5291 DemandedDstElts, CostKind);
5292 }
5293
5294 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5295 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5296 "We expect that the legalization doesn't affect the element width, "
5297 "doesn't coalesce/split elements.");
5298
5299 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5300 unsigned NumDstVectors =
5301 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5302
5303 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5304
5305 // Not all the produced Dst elements may be demanded. In our case,
5306 // given that a single Dst vector is formed by a single shuffle,
5307 // if all elements that will form a single Dst vector aren't demanded,
5308 // then we won't need to do that shuffle, so adjust the cost accordingly.
// The mask is zero-extended to a whole number of legal destination vectors
// before being collapsed to one bit per vector.
5309 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5310 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5311 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5312
5313 InstructionCost SingleShuffleCost =
5314 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5315 /*Mask=*/{}, CostKind,
5316 /*Index=*/0, /*SubTp=*/nullptr);
5317 return NumDstVectorsDemanded * SingleShuffleCost;
5318}
5319
/// Estimates the cost of a plain (unmasked) load or store of type Src. The
/// body forwards to BaseT::getMemoryOpCost and recurses into getMemoryOpCost,
/// so that is presumably this function's name.
/// NOTE(review): the first signature line, the CostKind parameter line, the
/// guard that opens the early-return scope, the declaration of Cost and two
/// call heads are absent from this listing; the comments here describe only
/// the code that is visible.
5321 Align Alignment,
5322 unsigned AddressSpace,
5324 TTI::OperandValueInfo OpInfo,
5325 const Instruction *I) const {
5326 // FIXME: Load latency isn't handled here
5327 if (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency)
5328 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5329 CostKind, OpInfo, I);
5330
5331 // TODO: Handle other cost kinds.
// NOTE(review): the condition opening this scope is missing from the listing;
// given the TODO above it presumably tests CostKind - confirm against the
// real source. Inside it, a store through a GEP with any non-constant index
// costs 2 * TCC_Basic and everything else costs TCC_Basic.
5333 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5334 // Store instruction with index and scale costs 2 Uops.
5335 // Check the preceding GEP to identify non-const indices.
5336 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5337 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5338 return TTI::TCC_Basic * 2;
5339 }
5340 }
5341 return TTI::TCC_Basic;
5342 }
5343
5344 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5345 "Invalid Opcode");
5346 // Type legalization can't handle structs
5347 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5348 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5349 CostKind, OpInfo, I);
5350
5351 // Legalize the type.
5352 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5353
5354 auto *VTy = dyn_cast<FixedVectorType>(Src);
5355
// NOTE(review): Cost, the accumulator used below, is declared on a line that
// is absent from this listing.
5357
5358 // Add a cost for constant load to vector.
5359 if (Opcode == Instruction::Store && OpInfo.isConstant())
5360 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5361 /*AddressSpace=*/0, CostKind, OpInfo);
5362
5363 // Handle the simple case of non-vectors.
5364 // NOTE: this assumes that legalization never creates vector from scalars!
5365 if (!VTy || !LT.second.isVector()) {
5366 // Each load/store unit costs 1.
// Whatever Cost holds at this point is only kept when the legalized type is
// floating point; otherwise the result is just LT.first.
5367 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5368 }
5369
5370 bool IsLoad = Opcode == Instruction::Load;
5371
5372 Type *EltTy = VTy->getElementType();
5373
5374 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5375
5376 // Source of truth: how many elements were there in the original IR vector?
5377 const unsigned SrcNumElt = VTy->getNumElements();
5378
5379 // How far have we gotten?
5380 int NumEltRemaining = SrcNumElt;
5381 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5382 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5383
5384 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5385
5386 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5387 const unsigned XMMBits = 128;
5388 if (XMMBits % EltTyBits != 0)
5389 // Vector size must be a multiple of the element size. I.e. no padding.
5390 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5391 CostKind, OpInfo, I);
5392 const int NumEltPerXMM = XMMBits / EltTyBits;
5393
5394 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5395
// Outer loop: start at the widest legal op size and halve it each round until
// every element of the original vector has been accounted for.
// SubVecEltsLeft counts the elements still available in the current
// sub-vector; it is replenished in the inner loop when it reaches zero.
5396 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5397 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5398 // How many elements would a single op deal with at once?
5399 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5400 // Vector size must be a multiple of the element size. I.e. no padding.
5401 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5402 CostKind, OpInfo, I);
5403 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5404
5405 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5406 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5407 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5408 "Unless we haven't halved the op size yet, "
5409 "we have less than two op's sized units of work left.");
5410
// Ops narrower than an XMM register are still modelled on the XMM-sized
// vector type.
5411 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5412 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5413 : XMMVecTy;
5414
5415 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5416 "After halving sizes, the vector elt count is no longer a multiple "
5417 "of number of elements per operation?");
// CoalescedVecTy views CurrVecTy as a vector of integers, each as wide as one
// memory op (EltTyBits * CurrNumEltPerOp bits).
// NOTE(review): the head of the call in the false arm of this conditional is
// missing from the listing.
5418 auto *CoalescedVecTy =
5419 CurrNumEltPerOp == 1
5420 ? CurrVecTy
5422 IntegerType::get(Src->getContext(),
5423 EltTyBits * CurrNumEltPerOp),
5424 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5425 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5426 DL.getTypeSizeInBits(CurrVecTy) &&
5427 "coalesciing elements doesn't change vector width.");
5428
// Inner loop: issue as many ops of the current size as the remaining element
// count allows.
5429 while (NumEltRemaining > 0) {
5430 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
5431
5432 // Can we use this vector size, as per the remaining element count?
5433 // Iff the vector is naturally aligned, we can do a wide load regardless.
5434 if (NumEltRemaining < CurrNumEltPerOp &&
5435 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5436 break; // Try smaller vector size.
5437
5438 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5439 // as a proxy for a double-pumped AVX memory interface such as on
5440 // Sandybridge.
5441 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5442 // will be scalarized.
5443 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5444 Cost += 2;
5445 else if (CurrOpSizeBytes < 4)
5446 Cost += 2;
5447 else
5448 Cost += 1;
5449
5450 // If we're loading a uniform value, then we don't need to split the load,
5451 // loading just a single (widest) vector can be reused by all splits.
5452 if (IsLoad && OpInfo.isUniform())
5453 return Cost;
5454
5455 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5456
5457 // If we have fully processed the previous reg, we need to replenish it.
5458 if (SubVecEltsLeft == 0) {
5459 SubVecEltsLeft += CurrVecTy->getNumElements();
5460 // And that's free only for the 0'th subvector of a legalized vector.
// NOTE(review): the head of the cost call added here is missing from the
// listing; only its trailing argument line is visible.
5461 if (!Is0thSubVec)
5462 Cost +=
5465 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5466 }
5467
5468 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5469 // for smaller widths (32/16/8) we have to insert/extract them separately.
5470 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5471 // but let's pretend that it is also true for 16/8 bit wide ops...)
5472 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5473 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5474 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5475 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
// Exactly one coalesced element is demanded: the one this op touches.
5476 APInt DemandedElts =
5477 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5478 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5479 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5480 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5481 !IsLoad, CostKind);
5482 }
5483
// Account for the elements this op covered and fold the op size into the
// alignment that is tested by the wide-load check on later iterations.
5484 SubVecEltsLeft -= CurrNumEltPerOp;
5485 NumEltRemaining -= CurrNumEltPerOp;
5486 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5487 }
5488 }
5489
5490 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5491
5492 return Cost;
5493}
5494
/// Dispatches on the intrinsic ID carried by MICA: masked gather/scatter go
/// to getGatherScatterOpCost, masked load/store go to getMaskedMemoryOpCost.
/// NOTE(review): the signature and the statement that handles every other
/// intrinsic ID (between the end of the switch and the closing brace) are
/// absent from this listing, so the function name and fallback behaviour
/// cannot be confirmed from here.
5498 switch (MICA.getID()) {
5499 case Intrinsic::masked_scatter:
5500 case Intrinsic::masked_gather:
5501 return getGatherScatterOpCost(MICA, CostKind);
5502 case Intrinsic::masked_load:
5503 case Intrinsic::masked_store:
5504 return getMaskedMemoryOpCost(MICA, CostKind);
5505 }
5507}
5508
/// Cost of the masked load or masked store intrinsic described by MICA.
/// Non-vector data types are costed as an ordinary memory op; vector types
/// that are not legal for a masked op are costed as a scalarized sequence;
/// otherwise the cost is based on the legalized type plus any mask/data
/// adjustment shuffles.
/// NOTE(review): the signature and the declarations of MaskSplitCost,
/// ValueSplitCost and Cost are absent from this listing.
5512 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
5513 : Instruction::Store;
5514 Type *SrcTy = MICA.getDataType();
5515 Align Alignment = MICA.getAlignment();
5516 unsigned AddressSpace = MICA.getAddressSpace();
5517
5518 bool IsLoad = (Instruction::Load == Opcode);
5519 bool IsStore = (Instruction::Store == Opcode);
5520
5521 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5522 if (!SrcVTy)
5523 // To calculate scalar take the regular cost, without mask
5524 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5525
5526 unsigned NumElem = SrcVTy->getNumElements();
// The mask is modelled as a vector of i8 with one element per data element.
5527 auto *MaskTy =
5528 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5529 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5530 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5531 // Scalarization
// The returned total is: per-element scalar memory ops + splitting the data
// vector + splitting the mask vector + a compare and branch per element.
// NOTE(review): the left-hand sides of the MaskSplitCost and ValueSplitCost
// initializers, and the tail of the compare-cost call, are missing here.
5532 APInt DemandedElts = APInt::getAllOnes(NumElem);
5534 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5535 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5536 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5538 InstructionCost BranchCost = getCFInstrCost(Instruction::CondBr, CostKind);
5539 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5541 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5542 InstructionCost MemopCost =
5543 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5544 Alignment, AddressSpace, CostKind);
5545 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5546 }
5547
5548 // Legalize the type.
5549 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5550 auto VT = TLI->getValueType(DL, SrcVTy);
// NOTE(review): Cost is declared on a line absent from this listing.
5552 MVT Ty = LT.second;
5553 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5554 // APX masked load/store for scalar is cheap.
5555 return Cost + LT.first;
5556
// Same element count but a different simple type after legalization: treated
// as a promotion of the data and mask vectors.
5557 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5558 LT.second.getVectorNumElements() == NumElem)
5559 // Promotion requires extend/truncate for data and a shuffle for mask.
5560 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5561 0, nullptr) +
5562 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5563 0, nullptr);
5564
// The legalized registers hold more elements than the source has: the mask
// must be widened to LT.first * Ty.getVectorNumElements() elements.
5565 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5566 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5567 (unsigned)LT.first.getValue() *
5568 Ty.getVectorNumElements());
5569 // Expanding requires fill mask with zeroes
5570 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5571 CostKind, 0, MaskTy);
5572 }
5573
5574 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5575 if (!ST->hasAVX512())
5576 return Cost + LT.first * (IsLoad ? 2 : 8);
5577
5578 // AVX-512 masked load/store is cheaper
5579 return Cost + LT.first;
5580}
5581
/// Cost of computing a chain of pointers (Ptrs) derived from Base. When Info
/// reports a common base and a known stride, only the base GEP is charged
/// (or nothing when Base is not a GetElementPtrInst); every other case is
/// delegated to BaseT::getPointersChainCost.
/// NOTE(review): the first signature line and the CostKind parameter line
/// are absent from this listing.
5583 ArrayRef<const Value *> Ptrs, const Value *Base,
5584 const TTI::PointersChainInfo &Info, Type *AccessTy,
5586 if (Info.isSameBase() && Info.isKnownStride()) {
5587 // If all the pointers have known stride all the differences are translated
5588 // into constants. X86 memory addressing allows encoding it into
5589 // displacement. So we just need to take the base GEP cost.
5590 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5591 SmallVector<const Value *> Indices(BaseGEP->indices());
5592 return getGEPCost(BaseGEP->getSourceElementType(),
5593 BaseGEP->getPointerOperand(), Indices, nullptr,
5594 CostKind);
5595 }
// Base is not a GEP instruction: no address arithmetic is charged.
5596 return TTI::TCC_Free;
5597 }
5598 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5599}
5600
/// Cost of the address computation for an access through PtrTy. The special
/// handling applies only to vector pointer types when a ScalarEvolution
/// object is supplied and the subtarget lacks AVX2: non-strided accesses
/// cost NumVectorInstToHideOverhead and strided accesses without a constant
/// stride step cost 1. Everything else is delegated to
/// BaseT::getAddressComputationCost.
/// NOTE(review): the first signature lines and the CostKind parameter line
/// are absent from this listing.
5603 const SCEV *Ptr,
5605 // Address computations in vectorized code with non-consecutive addresses will
5606 // likely result in more instructions compared to scalar code where the
5607 // computation can more often be merged into the index mode. The resulting
5608 // extra micro-ops can significantly decrease throughput.
5609 const unsigned NumVectorInstToHideOverhead = 10;
5610
5611 // Cost modeling of Strided Access Computation is hidden by the indexing
5612 // modes of X86 regardless of the stride value. We don't believe that there
5613 // is a difference between constant strided access in general and constant
5614 // strided value which is less than or equal to 64.
5615 // Even in the case of (loop invariant) stride whose value is not known at
5616 // compile time, the address computation will not incur more than one extra
5617 // ADD instruction.
5618 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5619 // TODO: AVX2 is the current cut-off because we don't have correct
5620 // interleaving costs for prior ISA's.
5621 if (!BaseT::isStridedAccess(Ptr))
5622 return NumVectorInstToHideOverhead;
5623 if (!BaseT::getConstantStrideStep(SE, Ptr))
5624 return 1;
5625 }
5626
5627 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5628}
5629
/// Cost of reducing the vector ValTy to a scalar with the arithmetic Opcode.
/// Order of evaluation in the visible code:
///   1. an early delegation to BaseT (its guarding condition is not visible),
///   2. per-subtarget table lookups on the unlegalized type, then on the
///      legalized type (adding the cost of combining split registers),
///   3. dedicated tables for vXi1 AND/OR reductions and a bitcast + ctpop
///      model for vXi1 ADD,
///   4. a generic halving loop for power-of-2 element counts,
///   5. BaseT for everything else.
/// NOTE(review): the first signature lines, the condition guarding the first
/// return, and several argument lines of multi-line calls are absent from
/// this listing.
5632 std::optional<FastMathFlags> FMF,
5635 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5636
5637 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5638 // and make it as the cost.
5639
5640 static const CostTblEntry SLMCostTbl[] = {
5641 { ISD::FADD, MVT::v2f64, 3 },
5642 { ISD::ADD, MVT::v2i64, 5 },
5643 };
5644
5645 static const CostTblEntry SSE2CostTbl[] = {
5646 { ISD::FADD, MVT::v2f64, 2 },
5647 { ISD::FADD, MVT::v2f32, 2 },
5648 { ISD::FADD, MVT::v4f32, 4 },
5649 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5650 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5651 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5652 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5653 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5654 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5655 { ISD::ADD, MVT::v2i8, 2 },
5656 { ISD::ADD, MVT::v4i8, 2 },
5657 { ISD::ADD, MVT::v8i8, 2 },
5658 { ISD::ADD, MVT::v16i8, 3 },
5659 };
5660
5661 static const CostTblEntry AVX1CostTbl[] = {
5662 { ISD::FADD, MVT::v4f64, 3 },
5663 { ISD::FADD, MVT::v4f32, 3 },
5664 { ISD::FADD, MVT::v8f32, 4 },
5665 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5666 { ISD::ADD, MVT::v4i64, 3 },
5667 { ISD::ADD, MVT::v8i32, 5 },
5668 { ISD::ADD, MVT::v16i16, 5 },
5669 { ISD::ADD, MVT::v32i8, 4 },
5670 };
5671
5672 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5673 assert(ISD && "Invalid opcode");
5674
5675 // Before legalizing the type, give a chance to look up illegal narrow types
5676 // in the table.
5677 // FIXME: Is there a better way to do this?
5678 EVT VT = TLI->getValueType(DL, ValTy);
5679 if (VT.isSimple()) {
5680 MVT MTy = VT.getSimpleVT();
5681 if (ST->useSLMArithCosts())
5682 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5683 return Entry->Cost;
5684
5685 if (ST->hasAVX())
5686 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5687 return Entry->Cost;
5688
5689 if (ST->hasSSE2())
5690 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5691 return Entry->Cost;
5692 }
5693
5694 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5695
5696 MVT MTy = LT.second;
5697
5698 auto *ValVTy = cast<FixedVectorType>(ValTy);
5699
5700 // Special case: vXi8 mul reductions are performed as vXi16.
// Modelled as a zero-extend to i16 elements plus a recursive query on the
// widened vector type.
5701 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5702 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5703 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5704 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5706 CostKind) +
5707 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5708 }
5709
5710 InstructionCost ArithmeticCost = 0;
5711 if (LT.first != 1 && MTy.isVector() &&
5712 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5713 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5714 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5715 MTy.getVectorNumElements());
5716 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5717 ArithmeticCost *= LT.first - 1;
5718 }
5719
// Second round of table lookups, now keyed on the legalized type; the cost of
// combining the split registers is added on top of the table entry.
5720 if (ST->useSLMArithCosts())
5721 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5722 return ArithmeticCost + Entry->Cost;
5723
5724 if (ST->hasAVX())
5725 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5726 return ArithmeticCost + Entry->Cost;
5727
5728 if (ST->hasSSE2())
5729 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5730 return ArithmeticCost + Entry->Cost;
5731
5732 // FIXME: These assume a naive kshift+binop lowering, which is probably
5733 // conservative in most cases.
5734 static const CostTblEntry AVX512BoolReduction[] = {
5735 { ISD::AND, MVT::v2i1, 3 },
5736 { ISD::AND, MVT::v4i1, 5 },
5737 { ISD::AND, MVT::v8i1, 7 },
5738 { ISD::AND, MVT::v16i1, 9 },
5739 { ISD::AND, MVT::v32i1, 11 },
5740 { ISD::AND, MVT::v64i1, 13 },
5741 { ISD::OR, MVT::v2i1, 3 },
5742 { ISD::OR, MVT::v4i1, 5 },
5743 { ISD::OR, MVT::v8i1, 7 },
5744 { ISD::OR, MVT::v16i1, 9 },
5745 { ISD::OR, MVT::v32i1, 11 },
5746 { ISD::OR, MVT::v64i1, 13 },
5747 };
5748
5749 static const CostTblEntry AVX2BoolReduction[] = {
5750 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5751 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5752 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5753 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5754 };
5755
5756 static const CostTblEntry AVX1BoolReduction[] = {
5757 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5758 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5759 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5760 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5761 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5762 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5763 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5764 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5765 };
5766
5767 static const CostTblEntry SSE2BoolReduction[] = {
5768 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5769 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5770 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5771 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5772 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5773 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5774 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5775 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5776 };
5777
5778 // Handle bool allof/anyof patterns.
5779 if (ValVTy->getElementType()->isIntegerTy(1)) {
5780 if (ISD == ISD::ADD) {
5781 // vXi1 addition reduction will bitcast to scalar and perform a popcount.
// NOTE(review): one argument line of the cast-cost call and the second
// operand of the '+' (presumably the cost query for ICA) are missing from the
// listing.
5782 auto *IntTy = IntegerType::getIntNTy(ValVTy->getContext(),
5783 ValVTy->getNumElements());
5784 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy});
5785 return getCastInstrCost(Instruction::BitCast, IntTy, ValVTy,
5787 CostKind) +
5789 }
5790
// NOTE(review): this ArithmeticCost shadows the function-scope variable of the
// same name declared earlier and is recomputed from the same expression.
5791 InstructionCost ArithmeticCost = 0;
5792 if (LT.first != 1 && MTy.isVector() &&
5793 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5794 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5795 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5796 MTy.getVectorNumElements());
5797 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5798 ArithmeticCost *= LT.first - 1;
5799 }
5800
// Tables are consulted from the newest feature level to the oldest.
5801 if (ST->hasAVX512())
5802 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5803 return ArithmeticCost + Entry->Cost;
5804 if (ST->hasAVX2())
5805 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5806 return ArithmeticCost + Entry->Cost;
5807 if (ST->hasAVX())
5808 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5809 return ArithmeticCost + Entry->Cost;
5810 if (ST->hasSSE2())
5811 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5812 return ArithmeticCost + Entry->Cost;
5813
5814 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5815 }
5816
5817 unsigned NumVecElts = ValVTy->getNumElements();
5818 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5819
5820 // Special case power of 2 reductions where the scalar type isn't changed
5821 // by type legalization.
5822 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5823 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5824
5825 InstructionCost ReductionCost = 0;
5826
5827 auto *Ty = ValVTy;
5828 if (LT.first != 1 && MTy.isVector() &&
5829 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5830 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5831 Ty = FixedVectorType::get(ValVTy->getElementType(),
5832 MTy.getVectorNumElements());
5833 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5834 ReductionCost *= LT.first - 1;
5835 NumVecElts = MTy.getVectorNumElements();
5836 }
5837
5838 // Now handle reduction with the legal type, taking into account size changes
5839 // at each level.
// Each iteration halves NumVecElts and charges one data-movement step (chosen
// by the bit width still to be reduced) plus one arithmetic op.
5840 while (NumVecElts > 1) {
5841 // Determine the size of the remaining vector we need to reduce.
5842 unsigned Size = NumVecElts * ScalarSize;
5843 NumVecElts /= 2;
5844 // If we're reducing from 256/512 bits, use an extract_subvector.
5845 if (Size > 128) {
5846 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5847 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5848 CostKind, NumVecElts, SubTy);
5849 Ty = SubTy;
5850 } else if (Size == 128) {
5851 // Reducing from 128 bits is a permute of v2f64/v2i64.
// NOTE(review): ValVTy is a FixedVectorType; if isFloatingPointTy() only
// matches scalar floating-point types this branch is never taken and the
// integer shuffle type is always used - confirm whether isFPOrFPVectorTy()
// was intended.
5852 FixedVectorType *ShufTy;
5853 if (ValVTy->isFloatingPointTy())
5854 ShufTy =
5855 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5856 else
5857 ShufTy =
5858 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5859 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5860 {}, CostKind, 0, nullptr);
5861 } else if (Size == 64) {
5862 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
// NOTE(review): same isFloatingPointTy() question as in the 128-bit case.
5863 FixedVectorType *ShufTy;
5864 if (ValVTy->isFloatingPointTy())
5865 ShufTy =
5866 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5867 else
5868 ShufTy =
5869 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5870 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5871 {}, CostKind, 0, nullptr);
5872 } else {
5873 // Reducing from smaller size is a shift by immediate.
// NOTE(review): the operand-info arguments of this call are missing from the
// listing.
5874 auto *ShiftTy = FixedVectorType::get(
5875 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5876 ReductionCost += getArithmeticInstrCost(
5877 Instruction::LShr, ShiftTy, CostKind,
5880 }
5881
5882 // Add the arithmetic op for this level.
5883 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5884 }
5885
5886 // Add the final extract element to the cost.
// NOTE(review): the last argument line of this call is missing from the
// listing.
5887 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5888 CostKind, 0, nullptr, nullptr,
5890}
5891
/// Cost of a single min/max operation: wraps intrinsic IID, with result type
/// Ty, two operands of type Ty and the fast-math flags FMF, in an
/// IntrinsicCostAttributes and forwards it to getIntrinsicInstrCost.
/// NOTE(review): the first signature lines are absent from this listing; the
/// name is taken from the getMinMaxCost(IID, Ty, CostKind, FMF) calls made
/// elsewhere in this file.
5894 FastMathFlags FMF) const {
5895 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5896 return getIntrinsicInstrCost(ICA, CostKind);
5897}
5898
/// Cost of reducing the vector ValTy to a scalar with the min/max intrinsic
/// IID. Per-subtarget tables are consulted first for the unlegalized type and
/// then for the legalized type (adding the cost of combining split
/// registers); power-of-2 element counts that miss every table go through a
/// generic halving loop, and everything else is delegated to
/// BaseT::getMinMaxReductionCost.
/// NOTE(review): the first signature lines, the declaration of ISD and
/// several argument lines of multi-line calls are absent from this listing.
5901 FastMathFlags FMF,
5903 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5904
5905 MVT MTy = LT.second;
5906
// The tables below only hold *MIN keys: umin and umax both map to ISD::UMIN,
// every other integer intrinsic to ISD::SMIN, minnum/maxnum to ISD::FMINNUM
// and every other FP intrinsic to ISD::FMINIMUM.
5908 if (ValTy->isIntOrIntVectorTy()) {
5909 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5910 : ISD::SMIN;
5911 } else {
5912 assert(ValTy->isFPOrFPVectorTy() &&
5913 "Expected float point or integer vector type.");
5914 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5915 ? ISD::FMINNUM
5916 : ISD::FMINIMUM;
5917 }
5918
5919 // We use llvm-mca across all supported CPUs to measure the cost stats.
// Each entry carries four costs that are indexed by CostKind in the lookups
// below.
5920 static const CostKindTblEntry SSE2CostTbl[] = {
5921 {ISD::SMIN, MVT::v2i64, {3, 4, 5, 6}},
5922 {ISD::UMIN, MVT::v2i64, {3, 4, 5, 6}},
5923 {ISD::SMIN, MVT::v2i32, {2, 2, 5, 6}},
5924 {ISD::UMIN, MVT::v2i32, {2, 2, 5, 6}},
5925 {ISD::SMIN, MVT::v4i32, {3, 7,11,12}},
5926 {ISD::UMIN, MVT::v4i32, {4, 7,14,15}},
5927 {ISD::SMIN, MVT::v2i16, {2, 3, 4, 4}},
5928 {ISD::UMIN, MVT::v2i16, {2, 3, 4, 6}},
5929 {ISD::SMIN, MVT::v4i16, {3, 5, 6, 6}},
5930 {ISD::UMIN, MVT::v4i16, {3, 5, 8, 10}},
5931 {ISD::SMIN, MVT::v8i16, {3, 8, 8, 8}},
5932 {ISD::UMIN, MVT::v8i16, {4, 8,12,14}},
5933 {ISD::SMIN, MVT::v2i8, {2, 3, 5, 6}},
5934 {ISD::UMIN, MVT::v2i8, {2, 3, 4, 4}},
5935 {ISD::SMIN, MVT::v4i8, {4, 6,12,13}},
5936 {ISD::UMIN, MVT::v4i8, {3, 6, 7, 7}},
5937 {ISD::SMIN, MVT::v8i8, {5, 9,18,19}},
5938 {ISD::UMIN, MVT::v8i8, {4, 8, 9, 9}},
5939 {ISD::SMIN, MVT::v16i8, {7,13,24,25}},
5940 {ISD::UMIN, MVT::v16i8, {3,10,11,11}},
5941 };
5942
5943 static const CostKindTblEntry SSE41CostTbl[] = {
5944 {ISD::SMIN, MVT::v2i64, {3, 4, 4, 6}},
5945 {ISD::UMIN, MVT::v2i64, {3, 4, 4, 6}},
5946 {ISD::SMIN, MVT::v2i32, {2, 2, 3, 3}},
5947 {ISD::UMIN, MVT::v2i32, {2, 2, 3, 3}},
5948 {ISD::SMIN, MVT::v4i32, {3, 4, 5, 5}},
5949 {ISD::UMIN, MVT::v4i32, {3, 4, 5, 5}},
5950 {ISD::UMIN, MVT::v2i16, {2, 3, 4, 4}},
5951 {ISD::SMIN, MVT::v4i16, {3, 5, 6, 6}},
5952 {ISD::UMIN, MVT::v4i16, {3, 5, 6, 6}},
5953 {ISD::SMIN, MVT::v8i16, {2, 8, 4, 5}},
5954 {ISD::UMIN, MVT::v8i16, {2, 5, 2, 2}},
5955 {ISD::SMIN, MVT::v2i8, {2, 3, 4, 4}},
5956 {ISD::SMIN, MVT::v4i8, {3, 6, 7, 7}},
5957 {ISD::SMIN, MVT::v8i8, {4, 8, 9, 9}},
5958 {ISD::SMIN, MVT::v16i8, {3,10, 7, 8}},
5959 {ISD::UMIN, MVT::v16i8, {3, 8, 5, 5}},
5960 };
5961
5962 static const CostKindTblEntry AVX1CostTbl[] = {
5963 {ISD::SMIN, MVT::v4i64, {5,11, 7,10}},
5964 {ISD::UMIN, MVT::v4i64, {6,12,10,13}},
5965 {ISD::SMIN, MVT::v8i32, {4, 9, 7, 7}},
5966 {ISD::UMIN, MVT::v8i32, {4, 9, 7, 7}},
5967 {ISD::SMIN, MVT::v16i16, {3,15, 6, 7}},
5968 {ISD::UMIN, MVT::v16i16, {2, 9, 4, 4}},
5969 {ISD::SMIN, MVT::v32i8, {4,17, 8, 9}},
5970 {ISD::UMIN, MVT::v32i8, {3,11, 6, 6}},
5971 };
5972
5973 static const CostKindTblEntry AVX2CostTbl[] = {
5974 {ISD::SMIN, MVT::v4i64, {4,11, 7,10}},
5975 {ISD::UMIN, MVT::v4i64, {4,12,10,13}},
5976 {ISD::SMIN, MVT::v2i32, {1, 2, 3, 3}},
5977 {ISD::UMIN, MVT::v2i32, {1, 2, 3, 3}},
5978 {ISD::UMIN, MVT::v4i32, {2, 4, 5, 5}},
5979 {ISD::SMIN, MVT::v4i32, {2, 4, 5, 5}},
5980 {ISD::SMIN, MVT::v8i32, {3, 9, 7, 7}},
5981 {ISD::UMIN, MVT::v8i32, {3, 9, 7, 7}},
5982 {ISD::SMIN, MVT::v4i16, {2, 4, 5, 5}},
5983 {ISD::UMIN, MVT::v4i16, {2, 4, 5, 5}},
5984 {ISD::SMIN, MVT::v16i16, {2,15, 6, 7}},
5985 {ISD::SMIN, MVT::v8i8, {3, 6, 7, 7}},
5986 {ISD::UMIN, MVT::v8i8, {3, 6, 7, 7}},
5987 {ISD::SMIN, MVT::v32i8, {3,17, 8, 9}},
5988 };
5989
5990 static const CostKindTblEntry AVX512FCostTbl[] = {
5991 {ISD::SMIN, MVT::v2i64, {2, 4, 3, 3}},
5992 {ISD::UMIN, MVT::v2i64, {2, 4, 3, 3}},
5993 {ISD::SMIN, MVT::v4i64, {3,10, 5, 5}},
5994 {ISD::UMIN, MVT::v4i64, {3,10, 5, 5}},
5995 {ISD::SMIN, MVT::v8i64, {5,16, 7, 7}},
5996 {ISD::UMIN, MVT::v8i64, {5,16, 7, 7}},
5997 {ISD::SMIN, MVT::v16i32, {4,12, 9, 9}},
5998 {ISD::UMIN, MVT::v16i32, {4,12, 9, 9}},
5999 };
6000
6001 static const CostKindTblEntry AVX512BWCostTbl[] = {
6002 {ISD::SMIN, MVT::v2i16, {1, 2, 3, 3}},
6003 {ISD::UMIN, MVT::v2i16, {1, 2, 3, 3}},
6004 {ISD::SMIN, MVT::v32i16, {2,19, 8, 9}},
6005 {ISD::UMIN, MVT::v32i16, {2,12, 6, 6}},
6006 {ISD::SMIN, MVT::v2i8, {1, 2, 3, 3}},
6007 {ISD::UMIN, MVT::v2i8, {1, 2, 3, 3}},
6008 {ISD::SMIN, MVT::v4i8, {2, 4, 5, 5}},
6009 {ISD::UMIN, MVT::v4i8, {2, 4, 5, 5}},
6010 {ISD::SMIN, MVT::v16i8, {2,10, 6, 7}},
6011 {ISD::UMIN, MVT::v16i8, {2, 6, 4, 4}},
6012 {ISD::SMIN, MVT::v32i8, {2,17, 8, 9}},
6013 {ISD::UMIN, MVT::v32i8, {2,10, 6, 6}},
6014 {ISD::SMIN, MVT::v64i8, {2,21,10,11}},
6015 {ISD::UMIN, MVT::v64i8, {2,14, 8, 8}},
6016 };
6017
6018 // Before legalizing the type, give a chance to look up illegal narrow types
6019 // in the table.
6020 // FIXME: Is there a better way to do this?
6021 EVT VT = TLI->getValueType(DL, ValTy);
6022 if (VT.isSimple()) {
// This MTy (the unlegalized simple type) shadows the legalized MTy declared
// above for the duration of this scope.
6023 MVT MTy = VT.getSimpleVT();
6024 if (ST->hasBWI())
6025 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
6026 if (auto KindCost = Entry->Cost[CostKind])
6027 return *KindCost;
6028
6029 if (ST->hasAVX512())
6030 if (const auto *Entry = CostTableLookup(AVX512FCostTbl, ISD, MTy))
6031 if (auto KindCost = Entry->Cost[CostKind])
6032 return *KindCost;
6033
6034 if (ST->hasAVX2())
6035 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
6036 if (auto KindCost = Entry->Cost[CostKind])
6037 return *KindCost;
6038
6039 if (ST->hasAVX())
6040 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
6041 if (auto KindCost = Entry->Cost[CostKind])
6042 return *KindCost;
6043
6044 if (ST->hasSSE41())
6045 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
6046 if (auto KindCost = Entry->Cost[CostKind])
6047 return *KindCost;
6048
6049 if (ST->hasSSE2())
6050 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
6051 if (auto KindCost = Entry->Cost[CostKind])
6052 return *KindCost;
6053 }
6054
6055 auto *ValVTy = cast<FixedVectorType>(ValTy);
6056 unsigned NumVecElts = ValVTy->getNumElements();
6057
6058 auto *Ty = ValVTy;
6059 InstructionCost MinMaxCost = 0;
6060 if (LT.first != 1 && MTy.isVector() &&
6061 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
6062 // Type needs to be split. We need LT.first - 1 operations.
6063 Ty = FixedVectorType::get(ValVTy->getElementType(),
6064 MTy.getVectorNumElements());
6065 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
6066 MinMaxCost *= LT.first - 1;
6067 NumVecElts = MTy.getVectorNumElements();
6068 }
6069
// Second round of table lookups, keyed on the legalized type; the cost of
// combining the split registers is added on top of the table entry.
6070 if (ST->hasBWI())
6071 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
6072 if (auto KindCost = Entry->Cost[CostKind])
6073 return MinMaxCost + *KindCost;
6074
6075 if (ST->hasAVX512())
6076 if (const auto *Entry = CostTableLookup(AVX512FCostTbl, ISD, MTy))
6077 if (auto KindCost = Entry->Cost[CostKind])
6078 return MinMaxCost + *KindCost;
6079
6080 if (ST->hasAVX2())
6081 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
6082 if (auto KindCost = Entry->Cost[CostKind])
6083 return MinMaxCost + *KindCost;
6084
6085 if (ST->hasAVX())
6086 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
6087 if (auto KindCost = Entry->Cost[CostKind])
6088 return MinMaxCost + *KindCost;
6089
6090 if (ST->hasSSE41())
6091 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
6092 if (auto KindCost = Entry->Cost[CostKind])
6093 return MinMaxCost + *KindCost;
6094
6095 if (ST->hasSSE2())
6096 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
6097 if (auto KindCost = Entry->Cost[CostKind])
6098 return MinMaxCost + *KindCost;
6099
6100 unsigned ScalarSize = ValTy->getScalarSizeInBits();
6101
6102 // Special case power of 2 reductions where the scalar type isn't changed
6103 // by type legalization.
6104 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
6105 ScalarSize != MTy.getScalarSizeInBits())
6106 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
6107
6108 // Now handle reduction with the legal type, taking into account size changes
6109 // at each level.
// Each iteration halves NumVecElts and charges one data-movement step (chosen
// by the bit width still to be reduced) plus one min/max op.
6110 while (NumVecElts > 1) {
6111 // Determine the size of the remaining vector we need to reduce.
6112 unsigned Size = NumVecElts * ScalarSize;
6113 NumVecElts /= 2;
6114 // If we're reducing from 256/512 bits, use an extract_subvector.
6115 if (Size > 128) {
6116 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
6117 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
6118 CostKind, NumVecElts, SubTy);
6119 Ty = SubTy;
6120 } else if (Size == 128) {
6121 // Reducing from 128 bits is a permute of v2f64/v2i64.
// NOTE(review): ValTy was cast to FixedVectorType above; if
// isFloatingPointTy() only matches scalar floating-point types this branch is
// never taken - confirm whether isFPOrFPVectorTy() was intended. The
// right-hand side of the FP assignment is also missing from the listing.
6122 VectorType *ShufTy;
6123 if (ValTy->isFloatingPointTy())
6124 ShufTy =
6126 else
6127 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
6128 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
6129 CostKind, 0, nullptr);
6130 } else if (Size == 64) {
6131 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
// NOTE(review): same isFloatingPointTy() question as in the 128-bit case.
6132 FixedVectorType *ShufTy;
6133 if (ValTy->isFloatingPointTy())
6134 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
6135 else
6136 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
6137 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
6138 CostKind, 0, nullptr);
6139 } else {
6140 // Reducing from smaller size is a shift by immediate.
// NOTE(review): this query passes TTI::TCK_RecipThroughput instead of
// CostKind, unlike every other cost query in this loop - confirm that is
// intended. The operand-info arguments are missing from the listing.
6141 auto *ShiftTy = FixedVectorType::get(
6142 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
6143 MinMaxCost += getArithmeticInstrCost(
6144 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
6147 }
6148
6149 // Add the arithmetic op for this level.
6150 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
6151 }
6152
6153 // Add the final extract element to the cost.
// NOTE(review): the last argument line of this call is missing from the
// listing.
6154 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
6155 CostKind, 0, nullptr, nullptr,
6157}
6158
6159/// Calculate the cost of materializing a 64-bit value. This helper
6160/// method might only calculate a fraction of a larger immediate. Therefore it
6161/// is valid to return a cost of ZERO.
///
/// \returns TTI::TCC_Free when Val is zero, TTI::TCC_Basic when
/// isInt<32>(Val) holds, and 2 * TTI::TCC_Basic otherwise.
/// NOTE(review): the signature line is absent from this listing; the name is
/// taken from the getIntImmCost(Val) call made by the chunked overload.
6163 if (Val == 0)
6164 return TTI::TCC_Free;
6165
6166 if (isInt<32>(Val))
6167 return TTI::TCC_Basic;
6168
6169 return 2 * TTI::TCC_Basic;
6170}
6171
6174 assert(Ty->isIntegerTy());
6175
6176 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6177 if (BitSize == 0)
6178 return ~0U;
6179
6180 // Never hoist constants larger than 128bit, because this might lead to
6181 // incorrect code generation or assertions in codegen.
6182 // Fixme: Create a cost model for types larger than i128 once the codegen
6183 // issues have been fixed.
6184 if (BitSize > 128)
6185 return TTI::TCC_Free;
6186
6187 if (Imm == 0)
6188 return TTI::TCC_Free;
6189
6190 // Sign-extend all constants to a multiple of 64-bit.
6191 APInt ImmVal = Imm;
6192 if (BitSize % 64 != 0)
6193 ImmVal = Imm.sext(alignTo(BitSize, 64));
6194
6195 // Split the constant into 64-bit chunks and calculate the cost for each
6196 // chunk.
6198 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
6199 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
6200 int64_t Val = Tmp.getSExtValue();
6201 Cost += getIntImmCost(Val);
6202 }
6203 // We need at least one instruction to materialize the constant.
6204 return std::max<InstructionCost>(1, Cost);
6205}
6206
6208 const APInt &Imm, Type *Ty,
6210 Instruction *Inst) const {
6211 assert(Ty->isIntegerTy());
6212
6213 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6214 unsigned ImmBitWidth = Imm.getBitWidth();
6215
6216 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6217 // here, so that constant hoisting will ignore this constant.
6218 if (BitSize == 0)
6219 return TTI::TCC_Free;
6220
6221 unsigned ImmIdx = ~0U;
6222 switch (Opcode) {
6223 default:
6224 return TTI::TCC_Free;
6225 case Instruction::GetElementPtr:
6226 // Always hoist the base address of a GetElementPtr. This prevents the
6227 // creation of new constants for every base constant that gets constant
6228 // folded with the offset.
6229 if (Idx == 0)
6230 return 2 * TTI::TCC_Basic;
6231 return TTI::TCC_Free;
6232 case Instruction::Store:
6233 ImmIdx = 0;
6234 break;
6235 case Instruction::ICmp:
6236 // This is an imperfect hack to prevent constant hoisting of
6237 // compares that might be trying to check if a 64-bit value fits in
6238 // 32-bits. The backend can optimize these cases using a right shift by 32.
6239 // There are other predicates and immediates the backend can use shifts for.
6240 if (Idx == 1 && ImmBitWidth == 64) {
6241 uint64_t ImmVal = Imm.getZExtValue();
6242 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6243 return TTI::TCC_Free;
6244
6245 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6246 if (Cmp->isEquality()) {
6247 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6248 if (Known.countMinTrailingZeros() >= 32)
6249 return TTI::TCC_Free;
6250 }
6251 }
6252 }
6253 ImmIdx = 1;
6254 break;
6255 case Instruction::And:
6256 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6257 // by using a 32-bit operation with implicit zero extension. Detect such
6258 // immediates here as the normal path expects bit 31 to be sign extended.
6259 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6260 return TTI::TCC_Free;
6261 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
6262 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6263 Imm.isMask())
6264 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6265 ImmIdx = 1;
6266 break;
6267 case Instruction::Add:
6268 case Instruction::Sub:
6269 // For add/sub, we can use the opposite instruction for INT32_MIN.
6270 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6271 return TTI::TCC_Free;
6272 ImmIdx = 1;
6273 break;
6274 case Instruction::UDiv:
6275 case Instruction::SDiv:
6276 case Instruction::URem:
6277 case Instruction::SRem:
6278 // Division by constant is typically expanded later into a different
6279 // instruction sequence. This completely changes the constants.
6280 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6281 return TTI::TCC_Free;
6282 case Instruction::Mul:
6283 case Instruction::Or:
6284 case Instruction::Xor:
6285 ImmIdx = 1;
6286 break;
6287 // Always return TCC_Free for the shift value of a shift instruction.
6288 case Instruction::Shl:
6289 case Instruction::LShr:
6290 case Instruction::AShr:
6291 if (Idx == 1)
6292 return TTI::TCC_Free;
6293 break;
6294 case Instruction::Trunc:
6295 case Instruction::ZExt:
6296 case Instruction::SExt:
6297 case Instruction::IntToPtr:
6298 case Instruction::PtrToInt:
6299 case Instruction::BitCast:
6300 case Instruction::PHI:
6301 case Instruction::Call:
6302 case Instruction::Select:
6303 case Instruction::Ret:
6304 case Instruction::Load:
6305 break;
6306 }
6307
6308 if (Idx == ImmIdx) {
6309 uint64_t NumConstants = divideCeil(BitSize, 64);
6311 return (Cost <= NumConstants * TTI::TCC_Basic)
6312 ? static_cast<int>(TTI::TCC_Free)
6313 : Cost;
6314 }
6315
6316 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6317}
6318
6321 const APInt &Imm, Type *Ty,
6323 assert(Ty->isIntegerTy());
6324
6325 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6326 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6327 // here, so that constant hoisting will ignore this constant.
6328 if (BitSize == 0)
6329 return TTI::TCC_Free;
6330
6331 switch (IID) {
6332 default:
6333 return TTI::TCC_Free;
6334 case Intrinsic::sadd_with_overflow:
6335 case Intrinsic::uadd_with_overflow:
6336 case Intrinsic::ssub_with_overflow:
6337 case Intrinsic::usub_with_overflow:
6338 case Intrinsic::smul_with_overflow:
6339 case Intrinsic::umul_with_overflow:
6340 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6341 return TTI::TCC_Free;
6342 break;
6343 case Intrinsic::experimental_stackmap:
6344 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6345 return TTI::TCC_Free;
6346 break;
6347 case Intrinsic::experimental_patchpoint_void:
6348 case Intrinsic::experimental_patchpoint:
6349 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6350 return TTI::TCC_Free;
6351 break;
6352 }
6353 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6354}
6355
6358 const Instruction *I) const {
6360 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6361 // Branches are assumed to be predicted.
6362 return TTI::TCC_Free;
6363}
6364
6365int X86TTIImpl::getGatherOverhead() const {
6366 // Some CPUs have more overhead for gather. The specified overhead is relative
6367 // to the Load operation. "2" is the number provided by Intel architects. This
6368 // parameter is used for cost estimation of Gather Op and comparison with
6369 // other alternatives.
6370 // TODO: Remove the explicit hasAVX512()?, That would mean we would only
6371 // enable gather with a -march.
6372 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6373 return 2;
6374
6375 return 1024;
6376}
6377
6378int X86TTIImpl::getScatterOverhead() const {
6379 if (ST->hasAVX512())
6380 return 2;
6381
6382 return 1024;
6383}
6384
6385// Return an average cost of Gather / Scatter instruction, maybe improved later.
6386InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6388 Type *SrcVTy, const Value *Ptr,
6389 Align Alignment,
6390 unsigned AddressSpace) const {
6391
6392 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6393 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6394
6395 // Try to reduce index size from 64 bit (default for GEP)
6396 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6397 // operation will use 16 x 64 indices which do not fit in a zmm and needs
6398 // to split. Also check that the base pointer is the same for all lanes,
6399 // and that there's at most one variable index.
// Returns the index width (in bits) to assume for the addresses computed by
// Ptr: either the pointer width from the data layout, or 32 when every
// variable GEP index can be treated as a 32-bit value.
6400 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6401 unsigned IndexSize = DL.getPointerSizeInBits();
6402 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
// Nothing to narrow if pointers are already narrower than 64 bits, or if the
// address is not produced by a GEP we can inspect.
6403 if (IndexSize < 64 || !GEP)
6404 return IndexSize;
6405
6406 unsigned NumOfVarIndices = 0;
6407 const Value *Ptrs = GEP->getPointerOperand();
// A vector base pointer is only acceptable when it is a splat, i.e. the base
// is the same for all lanes.
6408 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6409 return IndexSize;
// Operand 0 is the base pointer, so the indices start at operand 1.
6410 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
// Constant indices are skipped and do not count as variable indices.
6411 if (isa<Constant>(GEP->getOperand(I)))
6412 continue;
6413 Type *IndxTy = GEP->getOperand(I)->getType();
6414 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6415 IndxTy = IndexVTy->getElementType();
// Give up on a 64-bit index that is not a sign-extension, or on a second
// variable index. The counter is only incremented when the first condition
// is false, because of short-circuit evaluation.
6416 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6417 !isa<SExtInst>(GEP->getOperand(I))) ||
6418 ++NumOfVarIndices > 1)
6419 return IndexSize; // 64
6420 }
// All variable indices passed the checks above: use 32-bit indices.
6421 return (unsigned)32;
6422 };
6423
6424 // Trying to reduce IndexSize to 32 bits for vector 16.
6425 // By default the IndexSize is equal to pointer size.
6426 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6427 ? getIndexSizeInBits(Ptr, DL)
6428 : DL.getPointerSizeInBits();
6429
6430 auto *IndexVTy = FixedVectorType::get(
6431 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6432 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6433 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6434 InstructionCost::CostType SplitFactor =
6435 std::max(IdxsLT.first, SrcLT.first).getValue();
6436 if (SplitFactor > 1) {
6437 // Handle splitting of vector of pointers
6438 auto *SplitSrcTy =
6439 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6440 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6441 Alignment, AddressSpace);
6442 }
6443
6444 // If we didn't split, this will be a single gather/scatter instruction.
6446 return 1;
6447
6448 // The gather / scatter cost is given by Intel architects. It is a rough
6449 // number since we are looking at one instruction at a time.
6450 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6451 : getScatterOverhead();
6452 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6453 Alignment, AddressSpace, CostKind);
6454}
6455
6456/// Calculate the cost of Gather / Scatter operation
6460 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
6461 MICA.getID() == Intrinsic::vp_gather;
6462 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
6463 Type *SrcVTy = MICA.getDataType();
6464 const Value *Ptr = MICA.getPointer();
6465 Align Alignment = MICA.getAlignment();
6466 if ((Opcode == Instruction::Load &&
6467 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6469 Align(Alignment)))) ||
6470 (Opcode == Instruction::Store &&
6471 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6473 Align(Alignment)))))
6475
6476 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6477 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6478 if (!PtrTy && Ptr->getType()->isVectorTy())
6479 PtrTy = dyn_cast<PointerType>(
6480 cast<VectorType>(Ptr->getType())->getElementType());
6481 assert(PtrTy && "Unexpected type for Ptr argument");
6482 unsigned AddressSpace = PtrTy->getAddressSpace();
6483 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6484 AddressSpace);
6485}
6486
6488 const TargetTransformInfo::LSRCost &C2) const {
6489 // X86 specific here are "instruction number 1st priority".
6490 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
6491 C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6492 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
6493 C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6494}
6495
6497 return ST->hasMacroFusion() || ST->hasBranchFusion();
6498}
6499
6500static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6501 if (!ST->hasAVX())
6502 return false;
6503
6504 if (ScalarTy->isPointerTy())
6505 return true;
6506
6507 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6508 return true;
6509
6510 if (ScalarTy->isHalfTy() && ST->hasBWI())
6511 return true;
6512
6513 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6514 return true;
6515
6516 if (!ScalarTy->isIntegerTy())
6517 return false;
6518
6519 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6520 return IntWidth == 32 || IntWidth == 64 ||
6521 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6522}
6523
6525 unsigned AddressSpace,
6526 TTI::MaskKind MaskKind) const {
6527 Type *ScalarTy = DataTy->getScalarType();
6528
6529 // The backend can't handle a single element vector w/o CFCMOV.
6530 if (isa<VectorType>(DataTy) &&
6531 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6532 return ST->hasCF() &&
6533 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);
6534
6535 return isLegalMaskedLoadStore(ScalarTy, ST);
6536}
6537
6539 unsigned AddressSpace,
6540 TTI::MaskKind MaskKind) const {
6541 Type *ScalarTy = DataTy->getScalarType();
6542
6543 // The backend can't handle a single element vector w/o CFCMOV.
6544 if (isa<VectorType>(DataTy) &&
6545 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6546 return ST->hasCF() &&
6547 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);
6548
6549 return isLegalMaskedLoadStore(ScalarTy, ST);
6550}
6551
6552bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6553 unsigned DataSize = DL.getTypeStoreSize(DataType);
6554 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6555 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6556 // (the equivalent stores only require AVX).
6557 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6558 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6559
6560 return false;
6561}
6562
6563bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
6564 unsigned DataSize = DL.getTypeStoreSize(DataType);
6565
6566 // SSE4A supports nontemporal stores of float and double at arbitrary
6567 // alignment.
6568 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6569 return true;
6570
6571 // Besides the SSE4A subtarget exception above, only aligned stores are
6572 // available nontemporally on any other subtarget. And only stores with a
6573 // size of 4..32 bytes (powers of 2 only) are permitted.
6574 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6576 return false;
6577
6578 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6579 // loads require AVX2).
6580 if (DataSize == 32)
6581 return ST->hasAVX();
6582 if (DataSize == 16)
6583 return ST->hasSSE1();
6584 return true;
6585}
6586
6588 ElementCount NumElements) const {
6589 // movddup
6590 return ST->hasSSE3() && !NumElements.isScalable() &&
6591 NumElements.getFixedValue() == 2 &&
6592 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6593}
6594
6595bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6596 if (!isa<VectorType>(DataTy))
6597 return false;
6598
6599 if (!ST->hasAVX512())
6600 return false;
6601
6602 // The backend can't handle a single element vector.
6603 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6604 return false;
6605
6606 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6607
6608 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6609 return true;
6610
6611 if (!ScalarTy->isIntegerTy())
6612 return false;
6613
6614 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6615 return IntWidth == 32 || IntWidth == 64 ||
6616 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6617}
6618
6620 Align Alignment) const {
6621 return isLegalMaskedExpandLoad(DataTy, Alignment);
6622}
6623
6624bool X86TTIImpl::supportsGather() const {
6625 // Some CPUs have better gather performance than others.
6626 // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
6627 // enable gather with a -march.
6628 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6629}
6630
6632 Align Alignment) const {
6633 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6634 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6635 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6636 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6637 // Check, maybe the gather/scatter instruction is better in the VariableMask
6638 // case.
6639 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6640 return NumElts == 1 ||
6641 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6642}
6643
6645 Align Alignment) const {
6646 Type *ScalarTy = DataTy->getScalarType();
6647 if (ScalarTy->isPointerTy())
6648 return true;
6649
6650 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6651 return true;
6652
6653 if (!ScalarTy->isIntegerTy())
6654 return false;
6655
6656 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6657 return IntWidth == 32 || IntWidth == 64;
6658}
6659
6660bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6661 if (!supportsGather() || !ST->preferGather())
6662 return false;
6663 return isLegalMaskedGatherScatter(DataTy, Alignment);
6664}
6665
6666bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6667 unsigned Opcode1,
6668 const SmallBitVector &OpcodeMask) const {
6669 // ADDSUBPS 4xf32 SSE3
6670 // VADDSUBPS 4xf32 AVX
6671 // VADDSUBPS 8xf32 AVX2
6672 // ADDSUBPD 2xf64 SSE3
6673 // VADDSUBPD 2xf64 AVX
6674 // VADDSUBPD 4xf64 AVX2
6675
6676 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6677 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6678 if (!isPowerOf2_32(NumElements))
6679 return false;
6680 // Check the opcode pattern. We apply the mask on the opcode arguments and
6681 // then check if it is what we expect.
6682 for (int Lane : seq<int>(0, NumElements)) {
6683 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6684 // We expect FSub for even lanes and FAdd for odd lanes.
6685 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6686 return false;
6687 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6688 return false;
6689 }
6690 // Now check that the pattern is supported by the target ISA.
6691 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6692 if (ElemTy->isFloatTy())
6693 return ST->hasSSE3() && NumElements % 4 == 0;
6694 if (ElemTy->isDoubleTy())
6695 return ST->hasSSE3() && NumElements % 2 == 0;
6696 return false;
6697}
6698
6699bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6700 // AVX2 doesn't support scatter
6701 if (!ST->hasAVX512() || !ST->preferScatter())
6702 return false;
6703 return isLegalMaskedGatherScatter(DataType, Alignment);
6704}
6705
6706bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6707 EVT VT = TLI->getValueType(DL, DataType);
6708 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6709}
6710
6712 // FDIV is always expensive, even if it has a very low uop count.
6713 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6714 if (I->getOpcode() == Instruction::FDiv)
6715 return true;
6716
6718}
6719
6720bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6721
6723 const Function *Callee) const {
6724 const TargetMachine &TM = getTLI()->getTargetMachine();
6725
6726 // Work this as a subsetting of subtarget features.
6727 const FeatureBitset &CallerBits =
6728 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6729 const FeatureBitset &CalleeBits =
6730 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6731
6732 // Check whether features are the same (apart from the ignore list).
6733 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6734 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6735 if (RealCallerBits == RealCalleeBits)
6736 return true;
6737
6738 // If the features are a subset, we need to additionally check for calls
6739 // that may become ABI-incompatible as a result of inlining.
6740 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6741 return false;
6742
6743 for (const Instruction &I : instructions(Callee)) {
6744 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6745 // Having more target features is fine for inline ASM and intrinsics.
6746 if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
6747 continue;
6748
6750 for (Value *Arg : CB->args())
6751 Types.push_back(Arg->getType());
6752 if (!CB->getType()->isVoidTy())
6753 Types.push_back(CB->getType());
6754
6755 // Simple types are always ABI compatible.
6756 auto IsSimpleTy = [](Type *Ty) {
6757 return !Ty->isVectorTy() && !Ty->isAggregateType();
6758 };
6759 if (all_of(Types, IsSimpleTy))
6760 continue;
6761
6762 // Do a precise compatibility check.
6763 if (!areTypesABICompatible(Caller, Callee, Types))
6764 return false;
6765 }
6766 }
6767 return true;
6768}
6769
6771 const Function *Callee,
6772 ArrayRef<Type *> Types) const {
6773 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6774 return false;
6775
6776 // If we get here, we know the target features match. If one function
6777 // considers 512-bit vectors legal and the other does not, consider them
6778 // incompatible.
6779 const TargetMachine &TM = getTLI()->getTargetMachine();
6780
6781 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6783 return true;
6784
6785 // Consider the arguments compatible if they aren't vectors or aggregates.
6786 // FIXME: Look at the size of vectors.
6787 // FIXME: Look at the element types of aggregates to see if there are vectors.
6788 return llvm::none_of(Types,
6789 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6790}
6791
6793X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6795 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6796 Options.NumLoadsPerBlock = 2;
6797 // All GPR and vector loads can be unaligned.
6798 Options.AllowOverlappingLoads = true;
6799 if (IsZeroCmp) {
6800 // Only enable vector loads for equality comparison. Right now the vector
6801 // version is not as fast for three way compare (see #33329).
6802 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6803 if (PreferredWidth >= 512 && ST->hasAVX512())
6804 Options.LoadSizes.push_back(64);
6805 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6806 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6807 }
6808 if (ST->is64Bit()) {
6809 Options.LoadSizes.push_back(8);
6810 }
6811 Options.LoadSizes.push_back(4);
6812 Options.LoadSizes.push_back(2);
6813 Options.LoadSizes.push_back(1);
6814 return Options;
6815}
6816
6818 return supportsGather();
6819}
6820
6822 return false;
6823}
6824
6826 // TODO: We expect this to be beneficial regardless of arch,
6827 // but there are currently some unexplained performance artifacts on Atom.
6828 // As a temporary solution, disable on Atom.
6829 return !(ST->isAtom());
6830}
6831
6833 switch (II->getIntrinsicID()) {
6834 default:
6835 return true;
6836 case Intrinsic::vector_reduce_smax:
6837 case Intrinsic::vector_reduce_smin:
6838 case Intrinsic::vector_reduce_umax:
6839 case Intrinsic::vector_reduce_umin:
6840 return false;
6841 }
6842}
6843
6844// Get estimation for interleaved load/store operations and strided load.
6845// \p Indices contains indices for strided load.
6846// \p Factor - the factor of interleaving.
6847// AVX-512 provides 3-src shuffles that significantly reduces the cost.
6849 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6850 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6851 TTI::TargetCostKind CostKind, bool UseMaskForCond,
6852 bool UseMaskForGaps) const {
6853 // VecTy for interleave memop is <VF*Factor x Elt>.
6854 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6855 // VecTy = <12 x i32>.
6856
6857 // Calculate the number of memory operations (NumOfMemOps), required
6858 // for load/store the VecTy.
6859 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6860 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6861 unsigned LegalVTSize = LegalVT.getStoreSize();
6862 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6863
6864 // Get the cost of one memory operation.
6865 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6866 LegalVT.getVectorNumElements());
6867 InstructionCost MemOpCost;
6868 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6869 if (UseMaskedMemOp) {
6870 unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
6871 : Intrinsic::masked_store;
6872 MemOpCost = getMaskedMemoryOpCost(
6873 {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind);
6874 } else
6875 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
6876 CostKind);
6877
6878 unsigned VF = VecTy->getNumElements() / Factor;
6879 MVT VT =
6880 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6881
6882 InstructionCost MaskCost;
6883 if (UseMaskedMemOp) {
6884 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6885 for (unsigned Index : Indices) {
6886 assert(Index < Factor && "Invalid index for interleaved memory op");
6887 for (unsigned Elm = 0; Elm < VF; Elm++)
6888 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6889 }
6890
6891 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6892
6893 MaskCost = getReplicationShuffleCost(
6894 I1Type, Factor, VF,
6895 UseMaskForGaps ? DemandedLoadStoreElts
6897 CostKind);
6898
6899 // The Gaps mask is invariant and created outside the loop, therefore the
6900 // cost of creating it is not accounted for here. However if we have both
6901 // a MaskForGaps and some other mask that guards the execution of the
6902 // memory access, we need to account for the cost of And-ing the two masks
6903 // inside the loop.
6904 if (UseMaskForGaps) {
6905 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6906 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6907 }
6908 }
6909
6910 if (Opcode == Instruction::Load) {
6911 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6912 // contain the cost of the optimized shuffle sequence that the
6913 // X86InterleavedAccess pass will generate.
6914 // The cost of loads and stores are computed separately from the table.
6915
6916 // X86InterleavedAccess supports only the following interleaved-access groups.
// Entries are looked up by {Factor, per-member vector type}; the cost covers
// the shuffle sequence only, the loads are accounted for separately.
6917 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6918 {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
6919 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6920 {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
6921 };
6922
6923 if (const auto *Entry =
6924 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6925 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6926 // If an entry does not exist, fall back to the default implementation.
6927
6928 // Kind of shuffle depends on number of loaded values.
6929 // If we load the entire data in one register, we can use a 1-src shuffle.
6930 // Otherwise, we'll merge 2 sources in each operation.
6931 TTI::ShuffleKind ShuffleKind =
6932 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6933
6934 InstructionCost ShuffleCost = getShuffleCost(
6935 ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
6936
6937 unsigned NumOfLoadsInInterleaveGrp =
6938 Indices.size() ? Indices.size() : Factor;
6939 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6940 VecTy->getNumElements() / Factor);
6941 InstructionCost NumOfResults =
6942 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6943
6944 // About a half of the loads may be folded in shuffles when we have only
6945 // one result. If we have more than one result, or the loads are masked,
6946 // we do not fold loads at all.
6947 unsigned NumOfUnfoldedLoads =
6948 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6949
6950 // Get a number of shuffle operations per result.
6951 unsigned NumOfShufflesPerResult =
6952 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6953
6954 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6955 // When we have more than one destination, we need additional instructions
6956 // to keep the sources.
6957 InstructionCost NumOfMoves = 0;
6958 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6959 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6960
6961 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6962 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6963 NumOfMoves;
6964
6965 return Cost;
6966 }
6967
6968 // Store.
6969 assert(Opcode == Instruction::Store &&
6970 "Expected Store Instruction at this point");
6971 // X86InterleavedAccess supports only the following interleaved-access groups.
// Entries are looked up by {Factor, per-member vector type}; the cost covers
// the shuffle sequence only, the stores are accounted for separately.
6972 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6973 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6974 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6975 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6976
6977 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6978 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6979 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6980 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6981 };
6982
6983 if (const auto *Entry =
6984 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6985 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6986 // If an entry does not exist, fall back to the default implementation.
6987
6988 // There are no strided stores at the moment, and a store can't be folded
6989 // into a shuffle.
6990 unsigned NumOfSources = Factor; // The number of values to be merged.
6991 InstructionCost ShuffleCost =
6992 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
6993 CostKind, 0, nullptr);
6994 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6995
6996 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6997 // We need additional instructions to keep the sources.
6998 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
7000 MaskCost +
7001 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
7002 NumOfMoves;
7003 return Cost;
7004}
7005
// Estimates the cost of an interleaved load/store group whose wide vector type
// is BaseTy (cast to a fixed vector below) with interleave factor Factor.
// Visible flow:
//  1. With AVX-512 and a supported element type, delegate to a dedicated
//     routine.
//  2. If either mask flag is set, defer to the base implementation.
//  3. Otherwise look up the shuffle-sequence cost in the static tables below
//     (keyed by Factor and the VF x iN type) and add the memory-op cost.
//  4. If no table entry matches, defer to the base implementation.
// NOTE(review): the line carrying the return type and function name is missing
// from this listing (presumably InstructionCost
// X86TTIImpl::getInterleavedMemoryOpCost) - confirm against the real file.
7007 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
7008 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
7009 bool UseMaskForCond, bool UseMaskForGaps) const {
7010 auto *VecTy = cast<FixedVectorType>(BaseTy);
7011
// Element types accepted for the AVX-512 path: 32/64-bit ints, float, double
// and pointers always; i8/i16/half only with BWI; bfloat only with BF16.
7012 auto isSupportedOnAVX512 = [&](Type *VecTy) {
7013 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
7014 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
7015 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
7016 return true;
7017 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
7018 return ST->hasBWI();
7019 if (EltTy->isBFloatTy())
7020 return ST->hasBF16();
7021 return false;
7022 };
7023 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
// NOTE(review): the line that opens this return statement (the callee name) is
// missing from this listing; only its argument list survives below.
7025 Opcode, VecTy, Factor, Indices, Alignment,
7026 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
7027
// The lookup tables below are not used for masked groups; defer to the base
// implementation.
7028 if (UseMaskForCond || UseMaskForGaps)
7029 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7030 Alignment, AddressSpace, CostKind,
7031 UseMaskForCond, UseMaskForGaps);
7032
7033 // Get estimation for interleaved load/store operations for SSE-AVX2.
7034 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
7035 // computing the cost using a generic formula as a function of generic
7036 // shuffles. We therefore use a lookup table instead, filled according to
7037 // the instruction sequences that codegen currently generates.
7038
7039 // VecTy for interleave memop is <VF*Factor x Elt>.
7040 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
7041 // VecTy = <12 x i32>.
7042 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
7043
7044 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
7045 // the VF=2, while v2i128 is an unsupported MVT vector type
7046 // (see MachineValueType.h::getVectorVT()).
7047 if (!LegalVT.isVector())
7048 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7049 Alignment, AddressSpace, CostKind);
7050
// VF is the per-member vector length: total element count divided by Factor.
7051 unsigned VF = VecTy->getNumElements() / Factor;
7052 Type *ScalarTy = VecTy->getElementType();
7053 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
7054 if (!ScalarTy->isIntegerTy())
7055 ScalarTy =
7056 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
7057
7058 // Get the cost of all the memory operations.
7059 // FIXME: discount dead loads.
7060 InstructionCost MemOpCosts =
7061 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
7062
// Table key type: VF elements of the (integer-modelled) scalar type. Types
// without a simple value type cannot be looked up, so defer to the base
// implementation.
7063 auto *VT = FixedVectorType::get(ScalarTy, VF);
7064 EVT ETy = TLI->getValueType(DL, VT);
7065 if (!ETy.isSimple())
7066 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7067 Alignment, AddressSpace, CostKind);
7068
7069 // TODO: Complete for other data-types and strides.
7070 // Each combination of Stride, element bit width and VF results in a different
7071 // sequence; The cost tables are therefore accessed with:
7072 // Factor (stride) and VectorType=VFxiN.
7073 // The Cost accounts only for the shuffle sequence;
7074 // The cost of the loads/stores is accounted for separately.
7075 //
7076 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
7077 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
7078 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
7079 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
7080 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
7081 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
7082
7083 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
7084 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
7085 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
7086
7087 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
7088 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
7089 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
7090
7091 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
7092 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
7093 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
7094 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
7095
7096 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
7097 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
7098 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
7099 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
7100 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
7101
7102 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
7103 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
7104 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
7105 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
7106 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
7107
7108 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
7109 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
7110 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
7111 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
7112 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
7113
7114 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
7115 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
7116 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
7117 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
7118
7119 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
7120 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
7121 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
7122 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
7123 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
7124
7125 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
7126 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
7127 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
7128 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
7129 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
7130
7131 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
7132 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
7133 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
7134 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
7135 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
7136
7137 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
7138 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
7139 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
7140 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
7141
7142 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
7143 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
7144 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
7145 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
7146 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
7147
7148 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
7149 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
7150 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
7151 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
7152 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
7153
7154 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
7155 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
7156 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
7157 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
7158
7159 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
7160 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
7161 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
7162
7163 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
7164 };
7165
7166 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
7167 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
7168 };
7169
7170 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
7171 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
7172 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
7173
7174 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
7175 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
7176
7177 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
7178 };
7179
7180 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
7181 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
7182 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
7183
7184 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
7185 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
7186 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
7187
7188 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
7189 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
7190 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
7191 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
7192
7193 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
7194 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
7195 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
7196 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
7197 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
7198
7199 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
7200 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
7201 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
7202 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
7203 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
7204
7205 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
7206 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
7207 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
7208 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
7209 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
7210
7211 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
7212 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
7213 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
7214 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
7215 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
7216
7217 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
7218 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
7219 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
7220 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
7221
7222 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7223 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7224 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7225 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7226 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7227
7228 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7229 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7230 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7231 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7232 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7233
7234 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7235 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7236 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7237 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7238 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7239
7240 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7241 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7242 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7243 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7244
7245 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7246 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7247 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7248 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7249 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7250
7251 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7252 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7253 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7254 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7255 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7256
7257 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7258 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7259 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7260 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7261
7262 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7263 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7264 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7265 };
7266
7267 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7268 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7269 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7270 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7271
7272 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7273 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7274
7275 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7276 };
7277
7278 if (Opcode == Instruction::Load) {
// Scales the table's shuffle cost by the fraction of group members actually
// requested (Indices.size() out of Factor), rounding up, and adds the
// memory-op cost.
7279 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7280 MemOpCosts](const CostTblEntry *Entry) {
7281 // NOTE: this is just an approximation!
7282 // It can over/under -estimate the cost!
7283 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7284 };
7285
// Tables are consulted in the order AVX2, SSSE3, SSE2; the first one that is
// enabled on the subtarget and has a matching entry wins.
7286 if (ST->hasAVX2())
7287 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7288 ETy.getSimpleVT()))
7289 return GetDiscountedCost(Entry);
7290
7291 if (ST->hasSSSE3())
7292 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7293 ETy.getSimpleVT()))
7294 return GetDiscountedCost(Entry);
7295
7296 if (ST->hasSSE2())
7297 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7298 ETy.getSimpleVT()))
7299 return GetDiscountedCost(Entry);
7300 } else {
7301 assert(Opcode == Instruction::Store &&
7302 "Expected Store Instruction at this point");
7303 assert((!Indices.size() || Indices.size() == Factor) &&
7304 "Interleaved store only supports fully-interleaved groups.");
// Stores are never discounted: the full table cost is added to the memory-op
// cost.
7305 if (ST->hasAVX2())
7306 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7307 ETy.getSimpleVT()))
7308 return MemOpCosts + Entry->Cost;
7309
7310 if (ST->hasSSE2())
7311 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7312 ETy.getSimpleVT()))
7313 return MemOpCosts + Entry->Cost;
7314 }
7315
// No table entry matched: defer to the base implementation.
7316 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7317 Alignment, AddressSpace, CostKind,
7318 UseMaskForCond, UseMaskForGaps);
7319}
7320
// Costs the use of a scaled-index addressing mode. The visible body fills an
// addressing-mode descriptor (AM) from the arguments and asks the target
// lowering whether that mode is legal for Ty in AddrSpace; when it is, the
// result is `AM.Scale != 0`, i.e. 1 if a scale is in use and 0 otherwise.
// NOTE(review): the first signature line (return type, function name and the
// Ty / BaseGV parameters used below) is missing from this listing - presumably
// X86TTIImpl::getScalingFactorCost; confirm against the real file.
7322 StackOffset BaseOffset,
7323 bool HasBaseReg, int64_t Scale,
7324 unsigned AddrSpace) const {
7325 // Scaling factors are not free at all.
7326 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7327 // will take 2 allocations in the out of order engine instead of 1
7328 // for plain addressing mode, i.e. inst (reg1).
7329 // E.g.,
7330 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7331 // Requires two allocations (one for the load, one for the computation)
7332 // whereas:
7333 // vaddps (%rsi), %ymm0, %ymm1
7334 // Requires just 1 allocation, i.e., freeing allocations for other operations
7335 // and having less micro operations to execute.
7336 //
7337 // For some X86 architectures, this is even worse because for instance for
7338 // stores, the complex addressing mode forces the instruction to use the
7339 // "load" ports instead of the dedicated "store" port.
7340 // E.g., on Haswell:
7341 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7342 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
// NOTE(review): the declaration of AM is missing from this listing at this
// point; its type is not visible here.
7344 AM.BaseGV = BaseGV;
7345 AM.BaseOffs = BaseOffset.getFixed();
7346 AM.HasBaseReg = HasBaseReg;
7347 AM.Scale = Scale;
7348 AM.ScalableOffset = BaseOffset.getScalable();
7349 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7350 // Scale represents reg2 * scale, thus account for 1
7351 // as soon as we use a second register.
7352 return AM.Scale != 0;
// NOTE(review): the statement returned when the addressing mode is not legal
// is missing from this listing; the fallback value is not visible here.
7354}
7355
// Returns a fixed constant of 14; per the TODO below it is meant to be taken
// from the scheduling model's MispredictPenalty eventually.
// NOTE(review): the signature line is missing from this listing (presumably
// X86TTIImpl::getBranchMispredictPenalty) - confirm name and return type.
7357 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7358 return 14;
7359}
7360
// Decides, from the scalar bit width of Ty and the subtarget features, whether
// shifting a vector by a scalar amount is cheaper than shifting it by a fully
// general vector amount. Returns false as soon as a feature providing variable
// vector shifts for that width is found (XOP: 8/16/32/64, AVX2: 32/64,
// AVX512BW: 16), and true otherwise.
// NOTE(review): the signature line is missing from this listing; the name is
// presumably isVectorShiftByScalarCheap (called that way by the operand-sinking
// code later in this file) - confirm against the real file.
7362 unsigned Bits = Ty->getScalarSizeInBits();
7363
7364 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7365 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7366 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7367 return false;
7368
7369 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7370 // shifts just as cheap as scalar ones.
7371 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7372 return false;
7373
7374 // AVX512BW has shifts such as vpsllvw.
7375 if (ST->hasBWI() && Bits == 16)
7376 return false;
7377
7378 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7379 // fully general vector.
7380 return true;
7381}
7382
7383unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7384 Type *ScalarValTy, Align Alignment,
7385 unsigned AddrSpace) const {
7386 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7387 return 4;
7388 }
7389 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy, Alignment,
7390 AddrSpace);
7391}
7392
// Collects into Ops the operand uses of I that are worth sinking next to I,
// and reports whether any were found. Three patterns are recognised:
//  1. `and` with a `not` (or splatted `not`) operand, when BMI is available or
//     the type is a vector and SSE2 is available.
//  2. Multiplies of fixed vectors with i64 elements whose operands are
//     sign/zero-extended-in-register 32-bit values.
//  3. Vector shifts and fshl/fshr whose shift amount is a splat shuffle, when
//     isVectorShiftByScalarCheap() says a scalar shift amount is cheaper.
// NOTE(review): the first signature line (return type, function name and the
// parameter I) is missing from this listing - presumably
// X86TTIImpl::isProfitableToSinkOperands; confirm against the real file.
7394 SmallVectorImpl<Use *> &Ops) const {
7395 using namespace llvm::PatternMatch;
7396
7397 if (I->getOpcode() == Instruction::And &&
7398 (ST->hasBMI() || (I->getType()->isVectorTy() && ST->hasSSE2()))) {
7399 for (auto &Op : I->operands()) {
7400 // (and X, (not Y)) -> (andn X, Y)
7401 if (match(Op.get(), m_Not(m_Value())) && !I->getType()->isIntegerTy(8)) {
7402 Ops.push_back(&Op);
7403 return true;
7404 }
7405 // (and X, (splat (not Y))) -> (andn X, (splat Y))
7406 if (match(Op.get(),
// NOTE(review): the first line of this pattern matcher is missing from this
// listing; only its trailing arguments survive below.
7408 m_Value(), m_ZeroMask()))) {
// Record the chain innermost-first: the not, then the insertelement feeding
// the shuffle, then the shuffle itself.
7409 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
7410 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
7411 Ops.push_back(&Not);
7412 Ops.push_back(&InsertElt);
7413 Ops.push_back(&Op);
7414 return true;
7415 }
7416 }
7417 }
7418
// The remaining patterns apply only to fixed-width vector results.
7419 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7420 if (!VTy)
7421 return false;
7422
7423 if (I->getOpcode() == Instruction::Mul &&
7424 VTy->getElementType()->isIntegerTy(64)) {
7425 for (auto &Op : I->operands()) {
7426 // Make sure we are not already sinking this operand
7427 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7428 continue;
7429
7430 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7431 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7432 if (ST->hasSSE41() &&
7433 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7434 m_SpecificInt(32)))) {
7435 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7436 Ops.push_back(&Op);
7437 } else if (ST->hasSSE2() &&
7438 match(Op.get(),
7439 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7440 Ops.push_back(&Op);
7441 }
7442 }
7443
7444 return !Ops.empty();
7445 }
7446
7447 // A uniform shift amount in a vector shift or funnel shift may be much
7448 // cheaper than a generic variable vector shift, so make that pattern visible
7449 // to SDAG by sinking the shuffle instruction next to the shift.
// Operand index of the shift amount: 1 for plain shifts, 2 for fshl/fshr,
// -1 when I is neither.
7450 int ShiftAmountOpNum = -1;
7451 if (I->isShift())
7452 ShiftAmountOpNum = 1;
7453 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7454 if (II->getIntrinsicID() == Intrinsic::fshl ||
7455 II->getIntrinsicID() == Intrinsic::fshr)
7456 ShiftAmountOpNum = 2;
7457 }
7458
7459 if (ShiftAmountOpNum == -1)
7460 return false;
7461
7462 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7463 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7464 isVectorShiftByScalarCheap(I->getType())) {
7465 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7466 return true;
7467 }
7468
7469 return false;
7470}
7471
// Checks that every direct caller of F agrees with this subtarget on the EGPR
// feature. Walks F's users, skips those that are not calls with F as the
// called operand, and returns false on the first caller whose X86 subtarget
// has a different hasEGPR() setting; returns true if none disagree.
// NOTE(review): the signature line is missing from this listing (presumably
// X86TTIImpl::useFastCCForInternalCall, taking the function F) - confirm
// against the real file.
7473 bool HasEGPR = ST->hasEGPR();
7474 const TargetMachine &TM = getTLI()->getTargetMachine();
7475
7476 for (User *U : F.users()) {
// NOTE(review): the declaration of CB is missing from this listing at this
// point; presumably it is derived from U - confirm.
7478 if (!CB || CB->getCalledOperand() != &F)
7479 continue;
7480 Function *CallerFunc = CB->getFunction();
7481 if (TM.getSubtarget<X86Subtarget>(*CallerFunc).hasEGPR() != HasEGPR)
7482 return false;
7483 }
7484
7485 return true;
7486}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
CostTblEntryT< CostKindCosts > CostKindTblEntry
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
TypeConversionCostTblEntryT< CostKindCosts > TypeConversionCostKindTblEntry
This file a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1353
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:483
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy, Align Alignment, unsigned AddrSpace) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Value * getCalledOperand() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
const FeatureBitset & getFeatureBits() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
Class to represent pointers.
unsigned getAddressSpace() const
Return the address space of the Pointer type.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:263
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:291
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:399
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
Base class of all SIMD vector types.
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Type * getElementType() const
bool useAVX512Regs() const
bool hasAVX512() const
bool hasAVX2() const
bool useFastCCForInternalCall(Function &F) const override
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target, for a load/store of the specified type.
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy, Align Alignment, unsigned AddrSpace) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool prefersVectorizedAddressing() const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Type) const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
Calculate the cost of Gather / Scatter operation.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3061
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:823
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:783
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:792
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:800
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:791
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
ap_match< APInt > m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
auto m_Value()
Match an arbitrary value and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
This is an optimization pass for GlobalISel generic memory operations.
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t DataSize
Definition InstrProf.h:299
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition MathExtras.h:546
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1884
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2018
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Cost Table Entry.
Definition CostTable.h:25
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:256
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition CostTable.h:55