//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than to a specific CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem as that was the first CPU to support that feature level
/// and thus most likely has the worst case cost, although we may discard an
/// outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                   divss     sqrtss     rsqrtss
///   AMD K7          11-16     19         3
///   Piledriver      9-24      13-15      5
///   Jaguar          14        16         2
///   Pentium II,III  18        30         2
///   Nehalem         7-14      7-18       3
///   Haswell         10-13     11         5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
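
// For example, a table entry below like { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }
// stores the {RecipThroughput, Latency, CodeSize, SizeAndLatency} costs, so
// Entry->Cost[TTI::TCK_Latency] yields 2, and the lookup sites scale the
// result by the type legalization count: LT.first * *KindCost.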

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KiB
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KiB
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  return Vector                          ? VectorClass
         : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
                                         : GPRClass;
}
171
172unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
173 if (ClassID == VectorClass && !ST->hasSSE1())
174 return 0;
175
176 if (!ST->is64Bit())
177 return 8;
178
179 if ((ClassID == GPRClass && ST->hasEGPR()) ||
180 (ClassID != GPRClass && ST->hasAVX512()))
181 return 32;
182
183 return 16;
184}
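
// E.g. 32-bit mode exposes 8 registers per class; x86-64 with APX (EGPR)
// exposes 32 GPRs (r16-r31 in addition to rax..r15); and AVX-512 exposes
// 32 vector registers (zmm0-zmm31).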

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
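
// Note: -mprefer-vector-width=256 (or the "prefer-vector-width" function
// attribute) lowers PreferVectorWidth, so an AVX-512 subtarget may still
// report 256 bits here despite having 512-bit zmm registers.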

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let regular unrolling handle the loop, which saves the overflow
  // check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}
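
// E.g. a loop vectorized at VF=8 on an AVX CPU may be interleaved up to 4x
// (the function above returns 4) to keep the multiple vector ports busy,
// while on Atom the loop is never interleaved.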

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }
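  // E.g. the block above costs mul <8 x i8> as a zext to <8 x i16>, a
  // <8 x i16> multiply, and a trunc back to <8 x i8>, matching legalization.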

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
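  // E.g. the block above costs a multiply by splat(16) as a single shl by 4,
  // and a multiply by splat(-16) as that shift plus a subtract (0 - X).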

  // On X86, vector signed division by a power-of-two constant is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
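  // E.g. the block above costs udiv by splat(8) as an lshr by splat(3), and
  // urem by splat(8) as an and with splat(7).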

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
691
692 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
693 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
694 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
696 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
697 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
698 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
699 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
700 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
701 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
702
703 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
704 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
705 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
706 };
707
708 if (ST->hasBWI() && Op2Info.isUniform())
709 if (const auto *Entry =
710 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
711 if (auto KindCost = Entry->Cost[CostKind])
712 return LT.first * *KindCost;
713
714 static const CostKindTblEntry AVX512UniformCostTable[] = {
715 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
716 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
717 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
718
719 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
720 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
721 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
722
723 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
724 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
725 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
726 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
727 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
728 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
729 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
730 };
731
732 if (ST->hasAVX512() && Op2Info.isUniform())
733 if (const auto *Entry =
734 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
735 if (auto KindCost = Entry->Cost[CostKind])
736 return LT.first * *KindCost;
737
738 static const CostKindTblEntry AVX2UniformCostTable[] = {
739 // Uniform splats are cheaper for the following instructions.
740 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
741 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
742 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
743 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
744 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
745 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
746
747 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
748 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
749 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
750 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
751 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
752 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
753
754 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
755 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
756 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
757 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
758 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
759 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
760
761 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
762 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
763 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
764 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
765 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
766 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
767 };
768
769 if (ST->hasAVX2() && Op2Info.isUniform())
770 if (const auto *Entry =
771 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
772 if (auto KindCost = Entry->Cost[CostKind])
773 return LT.first * *KindCost;
774
775 static const CostKindTblEntry AVXUniformCostTable[] = {
776 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
777 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
778 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
779 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
780 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
781 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
782
783 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
784 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
785 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
786 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
787 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
788 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
789
790 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
791 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
792 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
793 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
794 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
795 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
796
797 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
798 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
799 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
800 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
801 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
802 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
803 };
804
805 // XOP has faster vXi8 shifts.
806 if (ST->hasAVX() && Op2Info.isUniform() &&
807 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
808 if (const auto *Entry =
809 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
810 if (auto KindCost = Entry->Cost[CostKind])
811 return LT.first * *KindCost;
812
813 static const CostKindTblEntry SSE2UniformCostTable[] = {
814 // Uniform splats are cheaper for the following instructions.
815 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
816 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
817 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
818
819 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
820 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
821 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
822
823 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
824 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
825 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
826
827 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
828 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
829 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
830 };
831
832 if (ST->hasSSE2() && Op2Info.isUniform() &&
833 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
834 if (const auto *Entry =
835 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
836 if (auto KindCost = Entry->Cost[CostKind])
837 return LT.first * *KindCost;
838
  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8, { 3, 10, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8, { 3, 11, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
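
  // Reading the table above: FDIV v16f32 { 10, 18, 1, 3 } means reciprocal
  // throughput 10, latency 18, one instruction of code size, and 3 micro-ops
  // for size-and-latency (Skylake numbers from agner.org).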
993
994 static const CostKindTblEntry AVX2ShiftCostTable[] = {
995 // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
996 // customize them to detect the cases where shift amount is a scalar one.
997 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
998 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
999 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
1000 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
1001 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
1002 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
1003 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
1004 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
1005 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
1006 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
1007 };
1008
1009 if (ST->hasAVX512()) {
1010 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
1011 // On AVX512, a packed v32i16 shift left by a constant build_vector
1012 // is lowered into a vector multiply (vpmullw).
1013 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
1014 Op1Info.getNoProps(), Op2Info.getNoProps());
1015 }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 && Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128-bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256-bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
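  // E.g. after the conversion above, shl <4 x i32> %x, <i32 1, i32 2, i32 3,
  // i32 4> is costed as mul %x, <i32 2, i32 4, i32 8, i32 16>, matching its
  // lowering to a single vector multiply.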

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21, 11, 16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23, 11, 22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27, 12, 18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30, 12, 24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17, 24, 30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20, 24, 43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5, 10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8, { 5, 18, 6, 12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8, { 4, 8, 8, 16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8, 13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },

    { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps

    { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
    { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SHL, MVT::v16i8, { 10, 21, 11, 17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 22, 22, 27, 40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16, { 6, 9, 11, 11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16, 24, 25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32, { 9, 11, 12, 17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64, { 6, 7, 11, 15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8, { 11, 27, 12, 18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 23, 23, 30, 43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16, { 13, 16, 14, 22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30, 31, 48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32, { 6, 7, 12, 16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32, { 14, 14, 26, 34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64, { 6, 7, 11, 15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8, { 21, 22, 24, 36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 44, 45, 51, 76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16, { 13, 16, 14, 22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30, 31, 48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32, { 6, 7, 12, 16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32, { 14, 14, 26, 34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64, { 5, 6, 10, 14 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64, { 12, 12, 22, 30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

1301 static const CostKindTblEntry SSE42CostTable[] = {
1302 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1303 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1306
1307 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1308 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1311
1312 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1313 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1314 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1315 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1316
1317 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1318 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1319 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1320 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1321
1322 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1323 };
1324
1325 if (ST->hasSSE42())
1326 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1327 if (auto KindCost = Entry->Cost[CostKind])
1328 return LT.first * *KindCost;
1329
1330 static const CostKindTblEntry SSE41CostTable[] = {
1331 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1332 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1333 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1334
1335 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1336 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1337 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1338 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1339
1340 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1341 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1342 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1343 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1344
1345 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1346 };
1347
1348 if (ST->hasSSE41())
1349 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSSE3CostTable[] = {
1354 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1355 };
1356
1357 if (ST->hasSSSE3())
1358 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1359 if (auto KindCost = Entry->Cost[CostKind])
1360 return LT.first * *KindCost;
1361
1362 static const CostKindTblEntry SSE2CostTable[] = {
1363 // We don't correctly identify costs of casts because they are marked as
1364 // custom.
1365 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1366 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1367 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1368 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1369
1370 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1371 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1372 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1373 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1374
1375 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1376 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1377 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1378 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1379
1380 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1381 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1382 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1383 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1384
1385 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1386 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1387 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1388 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1389
1390 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1391 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1392 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1393 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1394
1395 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1396 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1397
1398 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1399 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1400 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1401 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1402
1403 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1404
1405 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1406 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414
1415 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1417 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1418
1419 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1420 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1421 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1422
1423 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1424 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1425 };
1426
1427 if (ST->hasSSE2())
1428 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1429 if (auto KindCost = Entry->Cost[CostKind])
1430 return LT.first * *KindCost;
1431
1432 static const CostKindTblEntry SSE1CostTable[] = {
1433 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1438
1439 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1440 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1441
1442 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1443 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1444
1445 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1446 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1447 };
1448
1449 if (ST->hasSSE1())
1450 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1451 if (auto KindCost = Entry->Cost[CostKind])
1452 return LT.first * *KindCost;
1453
1454 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1455 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1456 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1457 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1458 };
1459
1460 if (ST->is64Bit())
1461 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1462 if (auto KindCost = Entry->Cost[CostKind])
1463 return LT.first * *KindCost;
1464
1465 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1466 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1467 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1468 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1469
1470 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1471 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1472 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1473
1474 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1475 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1476 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1477
1478 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1479 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1480 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1481 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1482 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1483 };
1484
1485 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1486 if (auto KindCost = Entry->Cost[CostKind])
1487 return LT.first * *KindCost;
1488
1489 // It is not a good idea to vectorize division. We have to scalarize it and
1490 // in the process we will often end up having to spill regular
1491 // registers. The overhead of division is going to dominate most kernels
1492 // anyway, so try hard to prevent vectorization of division - it is
1493 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1494 // to hide "20 cycles" for each lane.
1495 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1496 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1497 ISD == ISD::UREM)) {
1498 InstructionCost ScalarCost =
1499 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1500 Op1Info.getNoProps(), Op2Info.getNoProps());
1501 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1502 }
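// Illustrative sketch (not part of the source): for a v4i32 sdiv that
// legalizes to one vector register (LT.first == 1) with a scalar i32 sdiv
// cost of C, the returned cost is 20 * 1 * 4 * C, which reliably prices a
// vectorized division plan out of contention.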
1503
1504 // Handle some basic single instruction code size cases.
1505 if (CostKind == TTI::TCK_CodeSize) {
1506 switch (ISD) {
1507 case ISD::FADD:
1508 case ISD::FSUB:
1509 case ISD::FMUL:
1510 case ISD::FDIV:
1511 case ISD::FNEG:
1512 case ISD::AND:
1513 case ISD::OR:
1514 case ISD::XOR:
1515 return LT.first;
1517 }
1518 }
1519
1520 // Fallback to the default implementation.
1521 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1522 Args, CxtI);
1523}
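// Caller-side sketch (hypothetical pass code; `FAM`, `F` and `Ctx` are
// assumed to be in scope) showing how the tables above are reached via the
// generic TTI interface:
//
//   const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
//   Type *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 8);
//   InstructionCost C = TTI.getArithmeticInstrCost(
//       Instruction::Mul, VecTy, TargetTransformInfo::TCK_RecipThroughput);
//
// On an AVX2 target the lookup hits the AVX2 table; otherwise it cascades
// down the feature levels and finally falls back to the BaseT
// implementation.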
1524
1525 InstructionCost
1526 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1527 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1528 TTI::TargetCostKind CostKind) const {
1529 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1530 return TTI::TCC_Basic;
1531 return InstructionCost::getInvalid();
1532 }
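// For example, the SLP vectorizer queries this for alternating-opcode
// bundles such as an fadd/fsub pair, which X86 can lower to a single
// addsubps/addsubpd when isLegalAltInstr() accepts the opcode mask.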
1533
1534 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1535 VectorType *DstTy, VectorType *SrcTy,
1536 ArrayRef<int> Mask,
1537 TTI::TargetCostKind CostKind,
1538 int Index, VectorType *SubTp,
1539 ArrayRef<const Value *> Args,
1540 const Instruction *CxtI) const {
1541 assert((Mask.empty() || DstTy->isScalableTy() ||
1542 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1543 "Expected the Mask to match the return size if given");
1544 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1545 "Expected the same scalar types");
1546
1547 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1548 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1549 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1550
1551 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1552
1553 // If all args are constant then this will be constant folded away.
1554 if (!Args.empty() &&
1555 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1556 return TTI::TCC_Free;
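// E.g. (illustrative IR) a shuffle of two constant operands:
//   shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>,
//                 <4 x i32> zeroinitializer,
//                 <4 x i32> <i32 0, i32 5, i32 2, i32 7>
// folds to a constant vector, so no shuffle instruction is ever emitted.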
1557
1558 // Recognize a basic concat_vector shuffle.
1559 if (Kind == TTI::SK_PermuteTwoSrc &&
1560 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1561 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1562 return getShuffleCost(TTI::SK_InsertSubvector,
1563 VectorType::getDoubleElementsVectorType(SrcTy),
1564 SrcTy, {},
1565 CostKind, Mask.size() / 2, SrcTy);
1566
1567 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1568 if (Kind == TTI::SK_Transpose)
1569 Kind = TTI::SK_PermuteTwoSrc;
1570
1571 if (Kind == TTI::SK_Broadcast) {
1572 // For Broadcasts we are splatting the first element from the first input
1573 // register, so only need to reference that input and all the output
1574 // registers are the same.
1575 LT.first = 1;
1576
1577 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1578 using namespace PatternMatch;
1579 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1580 (ST->hasAVX2() ||
1581 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1582 return TTI::TCC_Free;
1583 }
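// E.g. (illustrative IR) a splat of a one-use load:
//   %x = load float, ptr %p
//   ... splat %x across <8 x float> ...
// lowers to a single memory-operand vbroadcastss on AVX, so the shuffle
// itself costs nothing.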
1584
1585 // Attempt to detect a cheaper in-lane shuffle, avoiding a 128-bit subvector
1586 // permutation.
1587 // Also attempt to detect a shuffle mask with a single defined element.
1588 bool IsInLaneShuffle = false;
1589 bool IsSingleElementMask = false;
1590 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1591 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1592 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1593 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1594 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1595 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1596 if ((Mask.size() % NumLanes) == 0) {
1597 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1598 return P.value() == PoisonMaskElem ||
1599 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1600 (P.index() / NumEltsPerLane);
1601 });
1602 IsSingleElementMask =
1603 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1604 return M == PoisonMaskElem;
1605 }));
1606 }
1607 }
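// Worked sketch: for a two-source v8i32 shuffle on a 256-bit target there
// are two 128-bit lanes of four elements. Mask <0,3,1,2,12,15,13,14> is
// in-lane (elements 12-15 land in lane 1 and come from lane 1 of the second
// source), whereas <4,5,6,7,0,1,2,3> crosses lanes and needs a subvector
// permute.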
1608
1609 // Treat <X x bfloat> shuffles as <X x half>.
1610 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1611 LT.second = LT.second.changeVectorElementType(MVT::f16);
1612
1613 // Subvector extractions are free if they start at the beginning of a
1614 // vector and cheap if the subvectors are aligned.
1615 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1616 int NumElts = LT.second.getVectorNumElements();
1617 if ((Index % NumElts) == 0)
1618 return TTI::TCC_Free;
1619 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1620 if (SubLT.second.isVector()) {
1621 int NumSubElts = SubLT.second.getVectorNumElements();
1622 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1623 return SubLT.first;
1624 // Handle some cases for widening legalization. For now we only handle
1625 // cases where the original subvector was naturally aligned and evenly
1626 // fit in its legalized subvector type.
1627 // FIXME: Remove some of the alignment restrictions.
1628 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1629 // vectors.
1630 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1631 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1632 (NumSubElts % OrigSubElts) == 0 &&
1633 LT.second.getVectorElementType() ==
1634 SubLT.second.getVectorElementType() &&
1635 LT.second.getVectorElementType().getSizeInBits() ==
1636 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1637 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1638 "Unexpected number of elements!");
1639 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1640 LT.second.getVectorNumElements());
1641 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1642 SubLT.second.getVectorNumElements());
1643 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1644 InstructionCost ExtractCost =
1645 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, VecTy, {}, CostKind,
1646 ExtractIndex, SubTy);
1647
1648 // If the original size is 32 bits or more, we can use pshufd. Otherwise,
1649 // if we have SSSE3, we can use pshufb.
1650 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1651 return ExtractCost + 1; // pshufd or pshufb
1652
1653 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1654 "Unexpected vector size");
1655
1656 return ExtractCost + 2; // worst case pshufhw + pshufd
1657 }
1658 }
1659 // If the extract subvector is not optimal, treat it as a single-op shuffle.
1660 Kind = TTI::SK_PermuteSingleSrc;
1661 }
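// E.g. on an SSE-only target <8 x float> legalizes to two v4f32 registers,
// so extracting the <4 x float> half at Index 4 satisfies
// (Index % NumElts) == 0 above and is just a register copy: TCC_Free.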
1662
1663 // Subvector insertions are cheap if the subvectors are aligned.
1664 // Note that in general, the insertion starting at the beginning of a vector
1665 // isn't free, because we need to preserve the rest of the wide vector,
1666 // but if the destination vector legalizes to the same width as the subvector
1667 // then the insertion will simplify to a (free) register copy.
1668 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1669 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1670 int NumElts = DstLT.second.getVectorNumElements();
1671 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1672 if (SubLT.second.isVector()) {
1673 int NumSubElts = SubLT.second.getVectorNumElements();
1674 bool MatchingTypes =
1675 NumElts == NumSubElts &&
1676 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1677 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1678 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1679 }
1680
1681 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1682 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1683 // v1f32 (legalised to f32) into a v4f32.
1684 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1685 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1686 return 1;
1687
1688 // If the insertion is the lowest subvector then it will be blended
1689 // otherwise treat it like a 2-op shuffle.
1690 Kind =
1691 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1692 }
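// E.g. inserting a legalized v4f32 subvector into a v8f32 at Index 4 is
// aligned, so it is costed as SubLT.first (a single vinsertf128-style op),
// while a low-half insertion with LT.first == 1 is recosted as a blend
// (SK_Select) below.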
1693
1694 // Handle some common (illegal) sub-vector types as they are often very cheap
1695 // to shuffle even on targets without PSHUFB.
1696 EVT VT = TLI->getValueType(DL, SrcTy);
1697 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1698 !ST->hasSSSE3()) {
1699 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1700 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1701 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1702 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1703 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1704 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1705
1706 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1707 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1708 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1709 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1710
1711 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1712 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1713 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1714 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1715
1716 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1717 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1718 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1719 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1720 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1721
1722 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1723 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1724 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1725 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1726 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1727 };
1728
1729 if (ST->hasSSE2())
1730 if (const auto *Entry =
1731 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1732 if (auto KindCost = Entry->Cost[CostKind])
1733 return LT.first * *KindCost;
1734 }
1735
1736 // We are going to permute multiple sources and the result will be in multiple
1737 // destinations. Provide an accurate cost only for splits where the element
1738 // type remains the same.
1739 if (LT.first != 1) {
1740 MVT LegalVT = LT.second;
1741 if (LegalVT.isVector() &&
1742 LegalVT.getVectorElementType().getSizeInBits() ==
1743 SrcTy->getElementType()->getPrimitiveSizeInBits() &&
1744 LegalVT.getVectorNumElements() <
1745 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1746 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1747 unsigned LegalVTSize = LegalVT.getStoreSize();
1748 // Number of source vectors after legalization:
1749 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1750 // Number of destination vectors after legalization:
1751 InstructionCost NumOfDests = LT.first;
1752
1753 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1754 LegalVT.getVectorNumElements());
1755
1756 if (!Mask.empty() && NumOfDests.isValid()) {
1757 // Try to perform a better estimation of the permutation.
1758 // 1. Split the source/destination vectors into real registers.
1759 // 2. Do the mask analysis to identify which real registers are
1760 // permuted. If more than one source register is used to build a
1761 // destination register, the cost for this destination register is
1762 // (number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1763 // source register is used, build the mask and calculate the cost as a
1764 // cost of PermuteSingleSrc.
1765 // Also, for the single register permute we try to identify if the
1766 // destination register is just a copy of the source register or a
1767 // copy of the previous destination register (the cost is
1768 // TTI::TCC_Basic). If the source register is just reused, the cost for
1769 // this operation is TTI::TCC_Free.
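// Worked sketch: with v16i32 split into four v4i32 registers, a destination
// register built from two different source registers is charged one
// SK_PermuteTwoSrc, one that merely reorders a single source register is
// charged one SK_PermuteSingleSrc, and an identity copy is free.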
1770 NumOfDests =
1771 getTypeLegalizationCost(
1772 FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1773 .first;
1774 unsigned E = NumOfDests.getValue();
1775 unsigned NormalizedVF =
1776 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1777 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1778 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1779 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1780 copy(Mask, NormalizedMask.begin());
1781 unsigned PrevSrcReg = 0;
1782 ArrayRef<int> PrevRegMask;
1783 InstructionCost Cost = 0;
1784 processShuffleMasks(
1785 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1786 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1787 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1788 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1789 // Check if the previous register can be just copied to the next
1790 // one.
1791 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1792 PrevRegMask != RegMask)
1793 Cost +=
1794 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1795 SingleOpTy, RegMask, CostKind, 0, nullptr);
1796 else
1797 // Just a copy of previous destination register.
1798 Cost += TTI::TCC_Basic;
1799 return;
1800 }
1801 if (SrcReg != DestReg &&
1802 any_of(RegMask, not_equal_to(PoisonMaskElem))) {
1803 // Just a copy of the source register.
1804 Cost += TTI::TCC_Basic;
1805 }
1806 PrevSrcReg = SrcReg;
1807 PrevRegMask = RegMask;
1808 },
1809 [this, SingleOpTy, CostKind,
1810 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1811 unsigned /*Unused*/, bool /*Unused*/) {
1812 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1813 SingleOpTy, RegMask, CostKind, 0, nullptr);
1814 });
1815 return Cost;
1816 }
1817
1818 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1819 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1820 SingleOpTy, {}, CostKind, 0,
1821 nullptr);
1822 }
1823
1824 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1825 SubTp);
1826 }
1827
1828 // If we're just moving a single element around (probably as an alternative to
1829 // extracting it), we can assume this is cheap.
1830 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1831 return TTI::TCC_Basic;
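// E.g. mask <poison, 2, poison, poison> on v4f32 moves one element into
// another position - effectively a single in-lane shuffle - so one basic
// instruction is a safe estimate.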
1832
1833 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1834 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1835 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1836 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1837 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1838 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1839 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1840 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1841 };
1842
1843 if (ST->hasVBMI())
1844 if (const auto *Entry =
1845 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1846 if (auto KindCost = Entry->Cost[CostKind])
1847 return LT.first * *KindCost;
1848
1849 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1850 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1851 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1852 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1853
1854 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1855 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1856 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1857 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1858 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1859
1860 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1861 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1862 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1863 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1864 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1865
1866 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1867 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1868 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1869 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1870 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1871
1872 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1873 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1874
1875 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1876 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1877 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1878 };
1879
1880 if (ST->hasBWI())
1881 if (const auto *Entry =
1882 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1883 if (auto KindCost = Entry->Cost[CostKind])
1884 return LT.first * *KindCost;
1885
1886 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1887 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1888 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1889 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1890 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1891 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1892 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1893 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1894 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1895 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1896 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1897 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1898 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1899 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1900 {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
1901
1902 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1903 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1904 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1905 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1906 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1907 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1908 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1909
1910 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1911 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1912 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1913 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1914 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1915 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1916 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1917 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1918 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1919 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1920 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1921
1922 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1923 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1924 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1925 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1926 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1927 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1928 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1929 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1930 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1931 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1932 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1933 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1934 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1935
1936 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1937 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1938 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1939 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1940 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1941 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1942 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1943 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1944 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1945 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1946 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1947 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1948
1949 // FIXME: This just applies the type legalization cost rules above
1950 // assuming these completely split.
1951 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1952 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1953 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1954 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1955 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1956 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1957
1958 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1959 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1960 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1961 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1962 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1963 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1964 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1965 };
1966
1967 if (ST->hasAVX512())
1968 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1969 if (auto KindCost = Entry->Cost[CostKind])
1970 return LT.first * *KindCost;
1971
1972 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
1973 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
1974 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
1975 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
1976
1977 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
1978 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
1979 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1980 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1981 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1982 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1983 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1984 };
1985
1986 if (IsInLaneShuffle && ST->hasAVX2())
1987 if (const auto *Entry =
1988 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1989 if (auto KindCost = Entry->Cost[CostKind])
1990 return LT.first * *KindCost;
1991
1992 static const CostKindTblEntry AVX2ShuffleTbl[] = {
1993 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
1994 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
1995 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
1996 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
1997 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
1998 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1999 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
2000 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
2001 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2002 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2003
2004 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2005 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2006 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2007 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2008 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2009 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2010 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2011
2012 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2013 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2014 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2015
2016 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2017 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2018 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2019 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2020 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2021
2022 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2023 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2024 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2025 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2026 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2027 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2028 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2029
2030 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2031 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2032 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2033 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2034 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2035 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2036 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2037 };
2038
2039 if (ST->hasAVX2())
2040 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2041 if (auto KindCost = Entry->Cost[CostKind])
2042 return LT.first * *KindCost;
2043
2044 static const CostKindTblEntry XOPShuffleTbl[] = {
2045 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2046 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2047 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2048 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2049 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2050 // + vinsertf128
2051 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2052 // + vinsertf128
2053
2054 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2055 // + vinsertf128
2056
2057 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2058 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2059 // + vinsertf128
2060 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2061 };
2062
2063 if (ST->hasXOP())
2064 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2065 if (auto KindCost = Entry->Cost[CostKind])
2066 return LT.first * *KindCost;
2067
2068 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2069 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2070 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2071 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2072 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2073
2074 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2075 // + vpor + vinsertf128
2076 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2077 // + vpor + vinsertf128
2078 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2079 // + vpor + vinsertf128
2080
2081 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2082 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2083 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2084 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2085 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2086 // + 2*vpor + vinsertf128
2087 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2088 // + 2*vpor + vinsertf128
2089 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2090 // + 2*vpor + vinsertf128
2091 };
2092
2093 if (IsInLaneShuffle && ST->hasAVX())
2094 if (const auto *Entry =
2095 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2096 if (auto KindCost = Entry->Cost[CostKind])
2097 return LT.first * *KindCost;
2098
2099 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2100 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2101 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2102 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2103 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2104 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2105 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2106 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2107
2108 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2109 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2110 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2111 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2112 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2113 // + vinsertf128
2114 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2115 // + vinsertf128
2116 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2117 // + vinsertf128
2118
2119 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2120 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2121 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2122 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2123 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2124 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2125 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2126
2127 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2128 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2129 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2130 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2131 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2132 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2133 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2134
2135 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2136 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2137 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2138 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2139 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2140 // + 2*por + vinsertf128
2141 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2142 // + 2*por + vinsertf128
2143 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2144 // + 2*por + vinsertf128
2145
2146 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2147 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2148 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2149 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2150 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2151 // + 4*por + vinsertf128
2152 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2153 // + 4*por + vinsertf128
2154 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2155 // + 4*por + vinsertf128
2156 };
2157
2158 if (ST->hasAVX())
2159 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2160 if (auto KindCost = Entry->Cost[CostKind])
2161 return LT.first * *KindCost;
2162
2163 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2164 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2165 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2166 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2167 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2168 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2169 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2170 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2171 };
2172
2173 if (ST->hasSSE41())
2174 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2175 if (auto KindCost = Entry->Cost[CostKind])
2176 return LT.first * *KindCost;
2177
2178 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2179 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2180 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2181 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2182
2183 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2184 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2185 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2186
2187 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2188 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2189 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2190
2191 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2192 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2193 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2194 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2195 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2196
2197 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2198 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2199 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2200
2201 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2202 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2203 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2204 };
2205
2206 if (ST->hasSSSE3())
2207 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2208 if (auto KindCost = Entry->Cost[CostKind])
2209 return LT.first * *KindCost;
2210
2211 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2212 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2213 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2214 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2215 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2216 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2217 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2218
2219 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2220 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2221 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2222 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2223 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2224 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2225 // + 2*pshufd + 2*unpck + packus
2226
2227 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2228 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2229 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2230 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2231 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2232 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2233
2234 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2235 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2236 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2237 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2238 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2239 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2240
2241 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2242 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2243 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2244 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2245 // + pshufd/unpck
2246 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2247 // + pshufd/unpck
2248 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2249 // + 2*pshufd + 2*unpck + 2*packus
2250
2251 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2252 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2253 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2254 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2255 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2256 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2257 };
2258
2259 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2260 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2261 };
2262
2263 if (ST->hasSSE2()) {
2264 bool IsLoad =
2265 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2266 if (ST->hasSSE3() && IsLoad)
2267 if (const auto *Entry =
2268 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2269 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2270 LT.second.getVectorElementCount()) &&
2271 "Table entry missing from isLegalBroadcastLoad()");
2272 return LT.first * Entry->Cost;
2273 }
2274
2275 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2276 if (auto KindCost = Entry->Cost[CostKind])
2277 return LT.first * *KindCost;
2278 }
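// E.g. with SSE3, splatting a one-use `load double` into <2 x double>
// lowers to a single memory-operand movddup, matching the zero-cost
// SSE3BroadcastLoadTbl entry above.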
2279
2280 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2281 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2282 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2283 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2284 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2285 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2286 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2287 };
2288
2289 if (ST->hasSSE1()) {
2290 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2291 // SHUFPS: both pairs must come from the same source register.
2292 auto MatchSHUFPS = [](int X, int Y) {
2293 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2294 };
2295 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2296 return 1;
2297 }
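// E.g. mask <0,3,5,6> matches: the low pair <0,3> reads only the first
// source and the high pair <5,6> only the second, so a single shufps
// suffices; <0,4,1,5> mixes sources within a pair and does not match.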
2298 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2299 if (auto KindCost = Entry->Cost[CostKind])
2300 return LT.first * *KindCost;
2301 }
2302
2303 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2304 SubTp);
2305}
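// Caller-side sketch (hypothetical; mirrors the signature implemented
// above):
//
//   InstructionCost C = TTI.getShuffleCost(
//       TargetTransformInfo::SK_Broadcast, VecTy, VecTy, /*Mask=*/{},
//       TargetTransformInfo::TCK_RecipThroughput, /*Index=*/0,
//       /*SubTp=*/nullptr);
//
// With AVX2 (and no AVX512) and VecTy == <8 x i32> this hits the
// vpbroadcastd entry in AVX2ShuffleTbl and returns 1.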
2306
2307 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2308 Type *Src,
2309 TTI::CastContextHint CCH,
2310 TTI::TargetCostKind CostKind,
2311 const Instruction *I) const {
2312 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2313 assert(ISD && "Invalid opcode");
2314
2315 // The cost tables include both specific, custom (non-legal) src/dst type
2316 // conversions and generic, legalized types. We test for customs first, before
2317 // falling back to legalization.
2318 // FIXME: Need a better design of the cost table to handle non-simple types of
2319 // potential massive combinations (elem_num x src_type x dst_type).
2320 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2321 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2322 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2323
2324 // Mask sign extend has an instruction.
2325 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2326 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2327 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2328 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2329 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2330 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2331 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2332 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2333 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2334 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2335 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2336 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2337 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2338 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2339 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2340 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2341 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2342
2343 // Mask zero extend is a sext + shift.
2344 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2345 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2346 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2347 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2348 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2349 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2350 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2351 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2352 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2353 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2354 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2355 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2356 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2357 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2358 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2359 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2360 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2361
2362 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2363 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2364 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2365 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2366 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2367 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2368 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2369 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2370 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2371 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2372 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2373 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2374 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2375 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2376 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2377 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2378 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2379
2380 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2381 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2382 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2383 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2384 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2385 };
2386
2387 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2388 // Mask sign extend has an instruction.
2389 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2390 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2391 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2392 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2393 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2394 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2395 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2396 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2397
2398 // Mask zero extend is a sext + shift.
2399 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2400 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2401 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2402 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2403 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2404 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1 } },
2405 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } },
2406 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } },
2407
2408 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2409 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2410 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2411 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2412 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2413 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2414 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2415 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2416
2417 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2418 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2419
2420 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2421 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2422
2423 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2424 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2425
2426 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2427 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2428 };
2429
2430 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2431 // 256-bit wide vectors.
2432
2433 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2434 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2435 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2436 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2437 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2438 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2439 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2440 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2441
2442 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2443 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2444 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2445 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2446 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2447 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2448 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2449 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2450 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2451 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2452 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2453 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2454 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2455 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2456 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2457 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2458 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2459 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2460 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2461 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2462 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2463 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2464 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2465 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2466 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2467 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2468 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2469 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2470 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2471 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2472 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2473 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2474 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2475 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } }, // 2*vpmovqd+concat+vpmovdb
2476
2477 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2478 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2479 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2480
2481 // Sign extend is zmm vpternlogd+vptruncdb.
2482 // Zero extend is zmm broadcast load+vptruncdw.
2483 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2484 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2485 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2486 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2487 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2488 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2489 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2490 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2491
2492 // Sign extend is zmm vpternlogd+vptruncdw.
2493 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2494 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2495 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2496 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2497 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2498 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2499 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2500 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2501 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2502
2503 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2504 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2505 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2506 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2507 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2508 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2509 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2510 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2511 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2512 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2513
2514 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2515 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2516 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2517 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2518
2519 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2520 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2521 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2522 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2523 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2524 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2525 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2526 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2527 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2528 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2529
2530 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2531 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2532
2533 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2534 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2535 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2536 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2537 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2538 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2539 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2540 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2541
2542 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2543 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2544 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2545 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2546 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2547 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2548 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2549 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2550 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2551 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2552
2553 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2554 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2555 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2556 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2557 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2558 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2559 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2560 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2561 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2562 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2563 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2564
2565 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2566 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2567 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2568 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2569 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2570 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2571 };
2572
2573 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2574 // Mask sign extend has an instruction.
2575 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2576 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2577 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2578 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2579 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2580 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2581 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2582 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2585 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2586 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2587 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2588 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2589 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2590 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2591 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2592
2593 // Mask zero extend is a sext + shift.
2594 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2595 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2596 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2597 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2598 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2599 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2600 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2601 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2602 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2604 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2606 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2608 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2610 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2611
2612 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2613 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2614 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2615 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2616 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2617 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2618 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2619 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2620 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2621 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2622 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2623 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2624 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2625 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2626 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2627 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2628 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2629
2630 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2631 };
2632
2633 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2634 // Mask sign extend has an instruction.
2635 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2636 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2637 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2638 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2639 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2640 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2641 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2642 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2643
2644 // Mask zero extend is a sext + shift.
2645 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2646 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2647 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2648 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2649 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2650 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2651 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2652 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2653
2654 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2655 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2656 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2657 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2658 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2659 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2660 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2661 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2662
2663 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2664 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2665 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2666 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2667
2668 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2669 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2670 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2671 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2672
2673 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2674 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2675 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2676 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2677
2678 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2679 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2680 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2681 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2682 };
2683
2684 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2685 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2686 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2687 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2688 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2689 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2690 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2691 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2692 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2693 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2694 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2695 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2696 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2697 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2698 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2699 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2700 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2701 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2702 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2703
2704 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2705 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2706 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2707 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2708 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2709 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2710 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2711 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2712 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2713 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2714
2715 // sign extend is vpcmpeq+maskedmove+vpmovdw
2716 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2717 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2718 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2719 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2720 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2721 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2722 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2723 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2724 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2725
2726 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2727 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2728 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2729 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2730 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2731 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2732 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2733 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2734
2735 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2736 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2737 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2738 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2739
2740 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2741 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2742 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2743 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2744 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2745 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2746 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2747 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2748 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2749 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2750 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2751 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2752
2753 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2754 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2755 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2756 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2757
2758 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2759 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2760 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2761 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2762 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2763 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2764 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2765 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2766 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2767 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2768 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2769 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2770 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2771
2772 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2773 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2774 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2775
2776 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2777 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2778 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2779 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2780 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2781 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2782 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2783 };
2784
2785 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2786 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2787 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2788 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2789 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2790 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2791 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2792
2793 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2794 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2795 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2796 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2797 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2798 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2799 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2800 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2801 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2802 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2803 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2804 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2805 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2806 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2807
2808 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2809
2810 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2811 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2812 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2813 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2814 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2815 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2816 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2817 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2818 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2819 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2820 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2821 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2822
2823 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2824 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2825
2826 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2827 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2828 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2829 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2830
2831 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2832 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2833 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2834 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2835 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2836 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2837 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2838 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2839
2840 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2841 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2845 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2846 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2847
2848 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2849 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2850 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2851 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2852 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2853 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2854 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2858 };
2859
2860 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2861 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2862 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2863 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2864 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2865 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2866 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2867
2868 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2869 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2870 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2871 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2872 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2873 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2874 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2875 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2876 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2877 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2878 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2879 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2880
2881 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2882 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2883 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2884 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2885 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2886
2887 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2888 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2889 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2890 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2891 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2892 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2893 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2894 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2895
2896 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2897 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2898 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2899 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2902 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2903 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2904 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2905 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2906 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2907 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2908
2909 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2913 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2914 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2915 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2916 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2917 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2918 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2919 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2920 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2921 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2922 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2923 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2924 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2925 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2926
2927 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2928 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2929 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2930 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2931 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2932 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2933 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2934 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2935 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2936 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2937 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2938
2939 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2940 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2941 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2942 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2943 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2944 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2945 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2946 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2947 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2948 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2949 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2950 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2951 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2952
2953 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2954 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2955 };
2956
2957 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2958 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2959 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2960 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2961 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2962 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2963 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2964 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2965 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2966 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2967 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2968 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2969 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2970
2971 // These truncates end up widening elements.
2972 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVXZBQ
2973 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ
2974 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVXZBD
2975
2976 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2977 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2978 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2979
2980 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2981 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2982 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2983 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2984 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2985 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2986 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2987 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2988 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2989 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2990 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2991
2992 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2993 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2994 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2995 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2996 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2997 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2998 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2999 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3000 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
3001 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3002 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
3003 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
3004 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
3005 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
3006
3007 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3008 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
3009 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3010 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
3011 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3012 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3013 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3014 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3015 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
3016 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
3017
3018 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3019 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3020 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3021 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3022 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3023 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3024 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3025 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3026 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3027 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3028 };
3029
3030 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
3031 // These are somewhat magic numbers justified by comparing the
3032 // output of llvm-mca for our various supported scheduler models
3033 // and taking the worst case scenario.
3034 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3035 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3036 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
3037 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
3038 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
3039 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3040 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
3041 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3042 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3043 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3044 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3045 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3046
3047 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3048 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3049 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3050 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3051 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3052 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3053 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3054 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3055 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3056 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3057 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3058 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3059 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3060
3061 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3062 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3063 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3064 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3065 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3066 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3067 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3068 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3069 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3070 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3071
3072 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3073 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3074 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3075 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3076 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3077 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3078 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3079 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3080 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3081 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3082
3083 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3084 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3085 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3086 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3087 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3088 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3089 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3090 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3091 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3092 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3093 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3094 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3095
3096 // These truncates are really widening elements.
3097 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3098 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3099 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3100 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3101 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3102 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3103
3104 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3105 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3106 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3107 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3108 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3109 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3110 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3111 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3112 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3113 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3114 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3115 };
3116
3117 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3118 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3119 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3120 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3121 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3122 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3123 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3124 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3125 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3126 };
3127
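// Sketch of the lookup flow below: for, say, a zext from <16 x i8> to
// <16 x i32> on an AVX512F target, ISD is ISD::ZERO_EXTEND, SimpleDstTy is
// MVT::v16i32 and SimpleSrcTy is MVT::v16i8; assuming no earlier table
// matches, the lookup falls through to the AVX512FConversionTbl entry
// { 1, 1, 1, 1 } and returns the cost for the requested CostKind.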
3128 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3129 EVT SrcTy = TLI->getValueType(DL, Src);
3130 EVT DstTy = TLI->getValueType(DL, Dst);
3131
3132 // The function getSimpleVT only handles simple value types.
3133 if (SrcTy.isSimple() && DstTy.isSimple()) {
3134 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3135 MVT SimpleDstTy = DstTy.getSimpleVT();
3136
3137 if (ST->useAVX512Regs()) {
3138 if (ST->hasBWI())
3139 if (const auto *Entry = ConvertCostTableLookup(
3140 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3141 if (auto KindCost = Entry->Cost[CostKind])
3142 return *KindCost;
3143
3144 if (ST->hasDQI())
3145 if (const auto *Entry = ConvertCostTableLookup(
3146 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3147 if (auto KindCost = Entry->Cost[CostKind])
3148 return *KindCost;
3149
3150 if (ST->hasAVX512())
3151 if (const auto *Entry = ConvertCostTableLookup(
3152 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3153 if (auto KindCost = Entry->Cost[CostKind])
3154 return *KindCost;
3155 }
3156
3157 if (ST->hasBWI())
3158 if (const auto *Entry = ConvertCostTableLookup(
3159 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3160 if (auto KindCost = Entry->Cost[CostKind])
3161 return *KindCost;
3162
3163 if (ST->hasDQI())
3164 if (const auto *Entry = ConvertCostTableLookup(
3165 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3166 if (auto KindCost = Entry->Cost[CostKind])
3167 return *KindCost;
3168
3169 if (ST->hasAVX512())
3170 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3171 SimpleDstTy, SimpleSrcTy))
3172 if (auto KindCost = Entry->Cost[CostKind])
3173 return *KindCost;
3174
3175 if (ST->hasAVX2()) {
3176 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3177 SimpleDstTy, SimpleSrcTy))
3178 if (auto KindCost = Entry->Cost[CostKind])
3179 return *KindCost;
3180 }
3181
3182 if (ST->hasAVX()) {
3183 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3184 SimpleDstTy, SimpleSrcTy))
3185 if (auto KindCost = Entry->Cost[CostKind])
3186 return *KindCost;
3187 }
3188
3189 if (ST->hasF16C()) {
3190 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3191 SimpleDstTy, SimpleSrcTy))
3192 if (auto KindCost = Entry->Cost[CostKind])
3193 return *KindCost;
3194 }
3195
3196 if (ST->hasSSE41()) {
3197 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3198 SimpleDstTy, SimpleSrcTy))
3199 if (auto KindCost = Entry->Cost[CostKind])
3200 return *KindCost;
3201 }
3202
3203 if (ST->hasSSE2()) {
3204 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3205 SimpleDstTy, SimpleSrcTy))
3206 if (auto KindCost = Entry->Cost[CostKind])
3207 return *KindCost;
3208 }
3209
3210 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3211 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3212 // fp16 conversions not covered by any table entries require a libcall.
3213 // Return a large (arbitrary) number to model this.
3214 return InstructionCost(64);
3215 }
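// E.g. without F16C or AVX-512, an f32 -> f16 fptrunc lowers to a runtime
// helper call (such as __truncsfhf2), which the large constant above is
// meant to approximate.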
3216 }
3217
3218 // Fall back to legalized types.
3219 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3220 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3221
3222 // If we're truncating to the same legalized type - just assume it's free.
3223 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3224 return TTI::TCC_Free;
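// For instance, a trunc from i48 to i40: both types promote to i64 during
// legalization, so LTSrc.second == LTDest.second and the cast is modelled
// as free.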
3225
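// In the legalized-type lookups below, a matching entry is scaled by the
// larger of the two legalization counts: e.g. if the source splits into two
// registers (LTSrc.first == 2) and the entry costs C for this CostKind, the
// estimate becomes 2 * C.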
3226 if (ST->useAVX512Regs()) {
3227 if (ST->hasBWI())
3228 if (const auto *Entry = ConvertCostTableLookup(
3229 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3230 if (auto KindCost = Entry->Cost[CostKind])
3231 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3232
3233 if (ST->hasDQI())
3234 if (const auto *Entry = ConvertCostTableLookup(
3235 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3236 if (auto KindCost = Entry->Cost[CostKind])
3237 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3238
3239 if (ST->hasAVX512())
3240 if (const auto *Entry = ConvertCostTableLookup(
3241 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3242 if (auto KindCost = Entry->Cost[CostKind])
3243 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3244 }
3245
3246 if (ST->hasBWI())
3247 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3248 LTDest.second, LTSrc.second))
3249 if (auto KindCost = Entry->Cost[CostKind])
3250 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3251
3252 if (ST->hasDQI())
3253 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3254 LTDest.second, LTSrc.second))
3255 if (auto KindCost = Entry->Cost[CostKind])
3256 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3257
3258 if (ST->hasAVX512())
3259 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3260 LTDest.second, LTSrc.second))
3261 if (auto KindCost = Entry->Cost[CostKind])
3262 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3263
3264 if (ST->hasAVX2())
3265 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3266 LTDest.second, LTSrc.second))
3267 if (auto KindCost = Entry->Cost[CostKind])
3268 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3269
3270 if (ST->hasAVX())
3271 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3272 LTDest.second, LTSrc.second))
3273 if (auto KindCost = Entry->Cost[CostKind])
3274 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3275
3276 if (ST->hasF16C()) {
3277 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3278 LTDest.second, LTSrc.second))
3279 if (auto KindCost = Entry->Cost[CostKind])
3280 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3281 }
3282
3283 if (ST->hasSSE41())
3284 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3285 LTDest.second, LTSrc.second))
3286 if (auto KindCost = Entry->Cost[CostKind])
3287 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3288
3289 if (ST->hasSSE2())
3290 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3291 LTDest.second, LTSrc.second))
3292 if (auto KindCost = Entry->Cost[CostKind])
3293 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3294
3295 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend to i32 and
3296 // cost the conversion from there.
3297 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3298 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3299 Type *ExtSrc = Src->getWithNewBitWidth(32);
3300 unsigned ExtOpc =
3301 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3302
3303 // For scalar loads the extend would be free.
3304 InstructionCost ExtCost = 0;
3305 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3306 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3307
3308 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3309 TTI::CastContextHint::None, CostKind);
3310 }
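// Worked example: a uitofp from <8 x i16> is costed as a zext to <8 x i32>
// plus a sitofp from <8 x i32>; after zero extension the value fits in 31
// bits, so the signed conversion's cost covers the unsigned case as well.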
3311
3312 // Fallback: for fptosi/fptoui i8/i16 cases, cost the operation as an
3313 // i32 conversion plus a truncate of the result.
3314 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3315 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3316 Type *TruncDst = Dst->getWithNewBitWidth(32);
3317 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3318 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3319 TTI::CastContextHint::None, CostKind);
3320 }
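// E.g. an fptoui to <4 x i8> is costed as the <4 x i32> conversion plus a
// trunc from <4 x i32> to <4 x i8>, mirroring how such narrow results are
// typically produced (e.g. cvttps2dq followed by a pack/truncate).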
3321
3322 // TODO: Allow non-throughput costs that aren't binary.
3323 auto AdjustCost = [&CostKind](InstructionCost Cost,
3324 InstructionCost N = 1) -> InstructionCost {
3325 if (CostKind != TTI::TCK_RecipThroughput)
3326 return Cost == 0 ? 0 : N;
3327 return Cost * N;
3328 };
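// E.g. with a base cost of 5 and N == 1, TCK_RecipThroughput reports
// 5 * 1 = 5, while the other cost kinds collapse any nonzero base cost to N
// (here 1); a base cost of 0 stays 0 for every kind.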
3329 return AdjustCost(
3330 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3331}
3332
3333 InstructionCost X86TTIImpl::getCmpSelInstrCost(
3334 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3335 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3336 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3337 // Early out if this type isn't scalar/vector integer/float.
3338 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3339 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3340 Op1Info, Op2Info, I);
3341
3342 // Legalize the type.
3343 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3344
3345 MVT MTy = LT.second;
3346
3347 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3348 assert(ISD && "Invalid opcode");
3349
3350 InstructionCost ExtraCost = 0;
3351 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3352 // Some vector comparison predicates cost extra instructions.
3353 // TODO: Adjust ExtraCost based on CostKind?
3354 // TODO: Should we invert this and assume worst case cmp costs
3355 // and reduce for particular predicates?
3356 if (MTy.isVector() &&
3357 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3358 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3359 ST->hasBWI())) {
3360 // Fall back to I's predicate if a specific one wasn't specified.
3361 CmpInst::Predicate Pred = VecPred;
3362 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3363 Pred == CmpInst::BAD_FCMP_PREDICATE))
3364 Pred = cast<CmpInst>(I)->getPredicate();
3365
3366 bool CmpWithConstant = false;
3367 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3368 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3369
3370 switch (Pred) {
3371 case CmpInst::Predicate::ICMP_NE:
3372 // xor(cmpeq(x,y),-1)
3373 ExtraCost = CmpWithConstant ? 0 : 1;
3374 break;
3375 case CmpInst::Predicate::ICMP_SGE:
3376 case CmpInst::Predicate::ICMP_SLE:
3377 // xor(cmpgt(x,y),-1)
3378 ExtraCost = CmpWithConstant ? 0 : 1;
3379 break;
3380 case CmpInst::Predicate::ICMP_ULT:
3381 case CmpInst::Predicate::ICMP_UGT:
3382 // cmpgt(xor(x,signbit),xor(y,signbit))
3383 // xor(cmpeq(pmaxu(x,y),x),-1)
3384 ExtraCost = CmpWithConstant ? 1 : 2;
3385 break;
3386 case CmpInst::Predicate::ICMP_ULE:
3387 case CmpInst::Predicate::ICMP_UGE:
3388 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3389 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3390 // cmpeq(psubus(x,y),0)
3391 // cmpeq(pminu(x,y),x)
3392 ExtraCost = 1;
3393 } else {
3394 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3395 ExtraCost = CmpWithConstant ? 2 : 3;
3396 }
3397 break;
3398 case CmpInst::Predicate::FCMP_ONE:
3399 case CmpInst::Predicate::FCMP_UEQ:
3400 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3401 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3402 if (CondTy && !ST->hasAVX())
3403 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3404 CmpInst::Predicate::FCMP_UNO, CostKind,
3405 Op1Info, Op2Info) +
3406 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3407 CmpInst::Predicate::FCMP_OEQ, CostKind,
3408 Op1Info, Op2Info) +
3409 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3410
3411 break;
3412 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3413 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3414 // Assume worst case scenario and add the maximum extra cost.
3415 ExtraCost = 3;
3416 break;
3417 default:
3418 break;
3419 }
3420 }
3421 }
3422
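// Worked example of the ExtraCost logic above: an ICMP_ULE compare of
// <16 x i8> on an SSE2-only target has no native unsigned compare, but
// pminub exists for bytes, so the expansion is cmpeq(pminu(x,y),x) and
// ExtraCost is 1 on top of the base pcmpeq cost from the tables below.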
3423 static const CostKindTblEntry SLMCostTbl[] = {
3424 // slm pcmpeq/pcmpgt throughput is 2
3425 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3426 // slm pblendvb/blendvpd/blendvps throughput is 4
3427 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3428 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3429 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3430 { ISD::SELECT, MVT::v4i32, { 4, 4, 1, 3 } }, // pblendvb
3431 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3432 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3433 };
3434
3435 static const CostKindTblEntry AVX512BWCostTbl[] = {
3436 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3437 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3438 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3439 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3440
3441 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3442 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3443 };
3444
3445 static const CostKindTblEntry AVX512CostTbl[] = {
3446 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3447 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3448 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3449 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3450
3451 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3452 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3453 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3454 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3455 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3456 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3457 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3458
3459 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3460 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3461 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3462 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3463 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3464 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3465 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3466 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3467 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3468 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3469 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3470 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3471 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3472 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3473
3474 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3475 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3476 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3477 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3478 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3479 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3480 };
3481
3482 static const CostKindTblEntry AVX2CostTbl[] = {
3483 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3484 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3485 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3486 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3487 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3488 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3489
3490 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3491 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3492 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3493 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3494
3495 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3496 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3497 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3498 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3499 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3500 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3501 };
3502
3503 static const CostKindTblEntry XOPCostTbl[] = {
3504 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3505 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3506 };
3507
3508 static const CostKindTblEntry AVX1CostTbl[] = {
3509 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3510 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3511 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3512 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3513 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3514 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3515
3516 // AVX1 does not support 8-wide integer compare.
3517 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3518 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3519 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3520 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3521
3522 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3523 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3524 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3525 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3526 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3527 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3528 };
3529
3530 static const CostKindTblEntry SSE42CostTbl[] = {
3531 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3532 };
3533
3534 static const CostKindTblEntry SSE41CostTbl[] = {
3535 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3536 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3537
3538 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3539 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3540 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3541 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3542 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3543 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3544 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3545 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3546 };
3547
3548 static const CostKindTblEntry SSE2CostTbl[] = {
3549 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3550 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3551
3552 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3553 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3554 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3555 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3556
3557 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3558 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3559 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3560 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3561 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3562 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3563 };
3564
3565 static const CostKindTblEntry SSE1CostTbl[] = {
3566 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3567 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3568
3569 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3570 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3571 };
3572
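// The lookups below are ordered from the most capable feature set down to
// SSE1, so the first enabled subtarget table containing the (ISD, MTy) pair
// decides the cost; LT.first then scales the entry by how many legal
// registers the original type was split into during legalization.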
3573 if (ST->useSLMArithCosts())
3574 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3575 if (auto KindCost = Entry->Cost[CostKind])
3576 return LT.first * (ExtraCost + *KindCost);
3577
3578 if (ST->hasBWI())
3579 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3580 if (auto KindCost = Entry->Cost[CostKind])
3581 return LT.first * (ExtraCost + *KindCost);
3582
3583 if (ST->hasAVX512())
3584 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3585 if (auto KindCost = Entry->Cost[CostKind])
3586 return LT.first * (ExtraCost + *KindCost);
3587
3588 if (ST->hasAVX2())
3589 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3590 if (auto KindCost = Entry->Cost[CostKind])
3591 return LT.first * (ExtraCost + *KindCost);
3592
3593 if (ST->hasXOP())
3594 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3595 if (auto KindCost = Entry->Cost[CostKind])
3596 return LT.first * (ExtraCost + *KindCost);
3597
3598 if (ST->hasAVX())
3599 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3600 if (auto KindCost = Entry->Cost[CostKind])
3601 return LT.first * (ExtraCost + *KindCost);
3602
3603 if (ST->hasSSE42())
3604 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3605 if (auto KindCost = Entry->Cost[CostKind])
3606 return LT.first * (ExtraCost + *KindCost);
3607
3608 if (ST->hasSSE41())
3609 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3610 if (auto KindCost = Entry->Cost[CostKind])
3611 return LT.first * (ExtraCost + *KindCost);
3612
3613 if (ST->hasSSE2())
3614 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3615 if (auto KindCost = Entry->Cost[CostKind])
3616 return LT.first * (ExtraCost + *KindCost);
3617
3618 if (ST->hasSSE1())
3619 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3620 if (auto KindCost = Entry->Cost[CostKind])
3621 return LT.first * (ExtraCost + *KindCost);
3622
3623 // Assume a 3cy latency for fp select ops.
3624 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3625 if (ValTy->getScalarType()->isFloatingPointTy())
3626 return 3;
3627
3628 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3629 Op1Info, Op2Info, I);
3630}
3631
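// For illustration, a minimal sketch (not part of this file) of how a client
// could query the hook above through the TargetTransformInfo wrapper; F, TM
// and Ctx are assumed to be in scope:
//   #include "llvm/Analysis/TargetTransformInfo.h"
//   TargetTransformInfo TTI = TM->getTargetTransformInfo(F);
//   auto *ValTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
//   auto *CondTy = FixedVectorType::get(Type::getInt1Ty(Ctx), 4);
//   InstructionCost Cost = TTI.getCmpSelInstrCost(
//       Instruction::FCmp, ValTy, CondTy, CmpInst::FCMP_OLT,
//       TargetTransformInfo::TCK_RecipThroughput);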
3632 InstructionCost
3633 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3634 TTI::TargetCostKind CostKind) const {
3635
3637 // Costs should match the codegen from:
3638 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3639 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3640 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3641 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3642 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3643
3644 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3645 // specialized in these tables yet.
3646 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3647 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3648 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3649 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3650 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3651 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3652 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3653 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3654 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3655 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3656 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3657 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3658 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3659 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3660 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3661 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3662 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3663 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3664 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3665 };
3666 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3667 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3668 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3669 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3670 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3671 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3672 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3673 };
3674 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3675 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3676 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3677 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3678 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3679 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3680 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3681 };
3682 static const CostKindTblEntry AVX512CDCostTbl[] = {
3683 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3684 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3685 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3686 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3687 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3688 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3689 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3690 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3691 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3692 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3693 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3694 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3695
3696 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3697 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3698 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3699 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3700 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3701 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3702 };
3703 static const CostKindTblEntry AVX512BWCostTbl[] = {
3704 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3705 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3706 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3707 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3708 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3709 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3710 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3711 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3712 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3713 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3714 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3715 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3716 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3717 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3718 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3719 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3720 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3721 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3722 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3723 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3724 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3725 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3726 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3727 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3728 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3729 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3730 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3731 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3732 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3733 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3734 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3735 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3736 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3737 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3738 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3739 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3740 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3741 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3742 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3743 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3744 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3745 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3746 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3747 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3748 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3749 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3750 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3751 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3752 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3753 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3754 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3755 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3756 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3757 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3758 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3759 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3760 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3761 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3762 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3763 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3764 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3765 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3766 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3767 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3768 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3769 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3770 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3771 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3772 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3773 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3774 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3775 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3776 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3777 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3778 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3779 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3780 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3781 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3782 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3783 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3784 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3785 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3786 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3787 };
3788 static const CostKindTblEntry AVX512CostTbl[] = {
3789 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3790 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3791 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3792 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3793 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3794 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3795 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3796 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3797 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3798 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3799 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3800 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3801 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3802 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3803 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3804 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3805 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3806 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3807 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3808 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3809 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3810 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3811 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3812 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3813 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3814 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3815 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3816 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3817 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3818 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3819 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3820 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3821 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3822 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3823 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3824 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3825 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3826 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3827 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3828 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3829 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3830 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3831 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3832 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3833 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3834 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3835 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3836 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3837 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3838 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3839 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3840 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3841 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3842 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3843 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3844 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3845 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3846 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3847 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3848 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3849 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3850 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3851 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3852 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3853 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3854 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3855 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3856 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3857 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3858 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3859 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3860 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3861 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3862 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3863 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3864 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3865 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3866 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3867 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3868 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3869 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3870 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3871 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3872 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3873 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3874 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3875 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3876 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3877 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3878 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3879 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3880 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3881 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3882 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3883 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3884 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3885 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3886 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3887 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3888 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3889 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3890 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3891 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3892 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3893 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3894 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3895 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3896 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3897 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3898 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3899 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3900 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3901 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3902 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3903 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3904 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3905 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3906 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3907 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3908 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3909 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3910 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3911 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3912 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3913 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3914 };
3915 static const CostKindTblEntry XOPCostTbl[] = {
3916 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3917 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3918 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3919 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3920 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3921 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3922 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3923 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3924 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3925 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3926 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3927 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3928 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3929 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3930 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3931 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3932 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3933 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3934 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3935 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3936 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3937 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3938 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3939 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3940 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3941 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3942 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3943 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3944 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3945 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3946 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3947 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3948 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3949 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3950 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3951 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3952 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3953 };
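// XOP has no native rotate-right, so the ROTR rows fold in the extra vector
// negate from the VPROT(X,SUB(0,Y)) expansion noted above; a sketch of the
// expected codegen for a variable <4 x i32> rotate-right:
//   vpxor %xmm2, %xmm2, %xmm2    ; zero
//   vpsubd %xmm1, %xmm2, %xmm2   ; 0 - amount
//   vprotd %xmm2, %xmm0, %xmm0   ; rotate left by the negated amount
// which is why the ROTR code-size/size-and-latency costs exceed ROTL's.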
3954 static const CostKindTblEntry AVX2CostTbl[] = {
3955 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3956 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3957 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3958 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3959 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3960 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3961 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3962 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3963 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3964 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3965 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3966 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3967 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3968 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3969 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3970 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3971 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3972 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3973 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3974 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3975 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3976 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3977 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3978 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3979 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3980 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3981 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3982 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3983 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3984 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3985 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3986 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3987 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3988 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3989 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3990 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3991 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3992 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3993 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3994 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3995 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3996 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3997 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3998 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3999 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
4000 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
4001 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4002 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4003 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4004 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4005 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4006 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4007 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4008 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4009 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4010 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4011 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4012 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4013 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4014 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4015 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4016 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4017 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4018 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4019 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4020 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4021 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4022 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4023 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4024 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4025 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4026 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4027 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4028 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4029 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4030 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4031 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4032 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4033 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4034 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4035 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4036 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4037 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4038 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4039 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4040 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4041 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4042 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4043 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4044 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4045 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4046 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4047 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4048 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4049 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4050 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4051 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4052 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4053 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4054 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4055 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4056 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4057 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4058 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4059 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4060 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4061 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4062 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4063 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4064 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4065 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4066 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4067 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4068 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4069 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4070 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4071 };
4072 static const CostKindTblEntry AVX1CostTbl[] = {
4073 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4074 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4075 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4076 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4077 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4078 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4079 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4080 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4081 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4082 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4083 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4084 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4085 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4086 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4087 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4088 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4089 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4090 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4091 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4092 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4093 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4094 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4095 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4096 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4097 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4098 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4099 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4100 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4101 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4103 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4104 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4105 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4106 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4107 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4108 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4109 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4110 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4111 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4112 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4113 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4114 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4115 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4116 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4117 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4118 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4119 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4120 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4121 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4122 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4123 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4124 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4125 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4126 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4127 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4128 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4129 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4130 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4131 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4132 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4133 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4134 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4135 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4136 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4137 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4138 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4139 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4140 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4141 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4142 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4143 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4144 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4145 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4146 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4147 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4148 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4149 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4150 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4151 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4152 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4153 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4154 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4155 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4156 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4157 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4158 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4159 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4160 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4161 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4162 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4163 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4164 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4165 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4166 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4167 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4168 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4170 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4171 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4172 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4173 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4174 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4175 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4176 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4177 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4178 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4179 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4180 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4181 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4182 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4183 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4184 };
4185 static const CostKindTblEntry GFNICostTbl[] = {
4186 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4187 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4188 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4189 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4190 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4191 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4192 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4193 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4194 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4195 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4196 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4197 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4198 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4199 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4200 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4201 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4202 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4203 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4204 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4205 };
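// GFNI hits these costs because one GF2P8AFFINEQB against a constant 8x8 bit
// matrix reverses the bits within every byte; wider element types only add a
// byte shuffle on top. A hedged intrinsics sketch (assuming a compiler and
// CPU with GFNI, e.g. a -mgfni build; not part of this file):
//   #include <immintrin.h>
//   __m128i bitrev_epi8(__m128i V) {
//     const __m128i M = _mm_set1_epi64x(0x8040201008040201ULL);
//     return _mm_gf2p8affine_epi64_epi8(V, M, 0);
//   }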
4206 static const CostKindTblEntry GLMCostTbl[] = {
4207 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4208 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4209 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4210 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4211 };
4212 static const CostKindTblEntry SLMCostTbl[] = {
4213 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4214 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4215 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4216 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4217 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4218 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4219 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4220 };
4221 static const CostKindTblEntry SSE42CostTbl[] = {
4222 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4223 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4224 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4225 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4226 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4227 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4228 };
4229 static const CostKindTblEntry SSE41CostTbl[] = {
4230 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4231 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4232 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4233 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4234 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4235 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4236 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4237 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4238 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4239 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4240 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4241 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4242 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4243 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4244 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4245 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4246 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4247 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4248 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4249 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4250 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4251 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4252 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4253 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4254 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4255 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4256 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4257 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4258 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4259 };
4260 static const CostKindTblEntry SSSE3CostTbl[] = {
4261 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4262 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4263 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4264 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4265 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4266 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4267 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4268 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4269 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4270 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4271 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4272 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4273 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4274 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4275 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4276 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4277 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4278 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4279 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4280 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4281 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4282 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4283 };
4284 static const CostKindTblEntry SSE2CostTbl[] = {
4285 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4286 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4287 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4288 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4289 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4290 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4291 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4292 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4293 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4294 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4295 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4296 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4297 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4298 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4299 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4300 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4301 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4302 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4303 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4304 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4305 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4306 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4307 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4308 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4309 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4310 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4311 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4312 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4313 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4314 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4315 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4316 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4317 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4318 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4319 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4320 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4321 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4322 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4323 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4324 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4325 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4326 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4327 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4328 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4329 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4330 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4331 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4332 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4333 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4334 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4335 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4336 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4337 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4338 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4339 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4340 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4341 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4342 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4343 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4344 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4345 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4346 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4347 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4348 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4349 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4350 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4351 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4352 };
4353 static const CostKindTblEntry SSE1CostTbl[] = {
4354 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4355 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4356 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4357 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4358 };
4359 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4360 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4361 };
4362 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4363 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4364 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4365 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4366 };
4367 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4368 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4369 };
4370 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4371 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4372 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4373 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4374 };
4375 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4376 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4377 };
4378 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4379 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4380 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4381 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4382 };
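// With the dedicated TZCNT/LZCNT/POPCNT instructions these scalar ops are a
// single uop; the i16/i8 rows pay one extra zero-extension, matching the
// "popcnt(zext())" note above, e.g. for an i8 input:
//   movzbl %dil, %eax
//   popcntl %eax, %eax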
4383 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4384 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4385 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4386 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4387 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4388 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4389 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4390 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4391 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4392 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4393 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4394 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4395 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4396 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4397 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4398 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4399 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4400 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4401 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4402 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4403 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4404 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4405 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4406 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4407 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4408 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4409 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4410 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4411 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4412 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4413 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4414 };
4415 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4416 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4417 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4418 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4419 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4420 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4421 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4422 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4423 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4424 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4425 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4426 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4427 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4428 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4429 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4430 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4431 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4432 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4433 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4434 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4435 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4436 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4437 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4438 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4439 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4440 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4441 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4442 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4443 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4444 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4445 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4446 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4447 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4448 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4449 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4450 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4451 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4452 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4453 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4454 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4455 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4456 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4457 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4458 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4459 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4460 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4461 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4462 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4463 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4464 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4465 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4466 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4467 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4468 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4469 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4470 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4471 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4472 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4473 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4474 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4475 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4476 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4477 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4478 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4479 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4480 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4481 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4482 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4483 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4484 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4485 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4486 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4487 };
4488
4489 Type *RetTy = ICA.getReturnType();
4490 Type *OpTy = RetTy;
4491 Intrinsic::ID IID = ICA.getID();
4492 unsigned ISD = ISD::DELETED_NODE;
4493 switch (IID) {
4494 default:
4495 break;
4496 case Intrinsic::abs:
4497 ISD = ISD::ABS;
4498 break;
4499 case Intrinsic::bitreverse:
4500 ISD = ISD::BITREVERSE;
4501 break;
4502 case Intrinsic::bswap:
4503 ISD = ISD::BSWAP;
4504 break;
4505 case Intrinsic::ctlz:
4506 ISD = ISD::CTLZ;
4507 break;
4508 case Intrinsic::ctpop:
4509 ISD = ISD::CTPOP;
4510 break;
4511 case Intrinsic::cttz:
4512 ISD = ISD::CTTZ;
4513 break;
4514 case Intrinsic::fshl:
4515 ISD = ISD::FSHL;
4516 if (!ICA.isTypeBasedOnly()) {
4517 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4518 if (Args[0] == Args[1]) {
4519 ISD = ISD::ROTL;
4520 // Handle uniform constant rotation amounts.
4521 // TODO: Handle funnel-shift cases.
4522 const APInt *Amt;
4523 if (Args[2] &&
4524 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4525 ISD = X86ISD::VROTLI;
4526 }
4527 }
4528 break;
4529 case Intrinsic::fshr:
4530 // FSHR has same costs so don't duplicate.
4531 ISD = ISD::FSHL;
4532 if (!ICA.isTypeBasedOnly()) {
4533 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4534 if (Args[0] == Args[1]) {
4535 ISD = ISD::ROTR;
4536 // Handle uniform constant rotation amount.
4537 // TODO: Handle funnel-shift cases.
4538 const APInt *Amt;
4539 if (Args[2] &&
4540 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4541 ISD = X86ISD::VROTLI;
4542 }
4543 }
4544 break;
4545 case Intrinsic::lrint:
4546 case Intrinsic::llrint: {
4547 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4548 // have the same costs as the CVTTP2SI (fptosi) instructions
4549 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4550 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4551 TTI::CastContextHint::None, CostKind);
4552 }
4553 case Intrinsic::maxnum:
4554 case Intrinsic::minnum:
4555 // FMINNUM has same costs so don't duplicate.
4556 ISD = ISD::FMAXNUM;
4557 break;
4558 case Intrinsic::sadd_sat:
4559 ISD = ISD::SADDSAT;
4560 break;
4561 case Intrinsic::smax:
4562 ISD = ISD::SMAX;
4563 break;
4564 case Intrinsic::smin:
4565 ISD = ISD::SMIN;
4566 break;
4567 case Intrinsic::ssub_sat:
4568 ISD = ISD::SSUBSAT;
4569 break;
4570 case Intrinsic::uadd_sat:
4571 ISD = ISD::UADDSAT;
4572 break;
4573 case Intrinsic::umax:
4574 ISD = ISD::UMAX;
4575 break;
4576 case Intrinsic::umin:
4577 ISD = ISD::UMIN;
4578 break;
4579 case Intrinsic::usub_sat:
4580 ISD = ISD::USUBSAT;
4581 break;
4582 case Intrinsic::sqrt:
4583 ISD = ISD::FSQRT;
4584 break;
4585 case Intrinsic::sadd_with_overflow:
4586 case Intrinsic::ssub_with_overflow:
4587 // SSUBO has same costs so don't duplicate.
4588 ISD = ISD::SADDO;
4589 OpTy = RetTy->getContainedType(0);
4590 break;
4591 case Intrinsic::uadd_with_overflow:
4592 case Intrinsic::usub_with_overflow:
4593 // USUBO has same costs so don't duplicate.
4594 ISD = ISD::UADDO;
4595 OpTy = RetTy->getContainedType(0);
4596 break;
4597 case Intrinsic::smul_with_overflow:
4598 ISD = ISD::SMULO;
4599 OpTy = RetTy->getContainedType(0);
4600 break;
4601 case Intrinsic::umul_with_overflow:
4602 ISD = ISD::UMULO;
4603 OpTy = RetTy->getContainedType(0);
4604 break;
4605 }
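// Net effect of the fshl/fshr cases above: a funnel shift whose two value
// operands match is costed as a rotate, e.g.
// "call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 5)" maps to X86ISD::VROTLI
// (rotate by uniform constant), while distinct operands keep ISD::FSHL.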
4606
4607 if (ISD != ISD::DELETED_NODE) {
4608 auto adjustTableCost = [&](int ISD, unsigned Cost,
4609 std::pair<InstructionCost, MVT> LT,
4610 FastMathFlags FMF) {
4611 InstructionCost LegalizationCost = LT.first;
4612 MVT MTy = LT.second;
4613
4614 // If there are no NANs to deal with, then these are reduced to a
4615 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4616 // assume is used in the non-fast case.
4617 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4618 if (FMF.noNaNs())
4619 return LegalizationCost * 1;
4620 }
4621
4622 // For cases where some ops can be folded into a load/store, assume free.
4623 if (MTy.isScalarInteger()) {
4624 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4625 if (const Instruction *II = ICA.getInst()) {
4626 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4627 return TTI::TCC_Free;
4628 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4629 if (LI->hasOneUse())
4630 return TTI::TCC_Free;
4631 }
4632 }
4633 }
4634 }
4635
4636 return LegalizationCost * (int)Cost;
4637 };
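// Two adjustments happen in adjustTableCost: an nnan FMAXNUM/FMINNUM
// collapses to a bare MAXP*/MINP*, and a scalar BSWAP feeding a single load
// or store is free on fast-MOVBE subtargets because it folds into MOVBE,
// e.g. (an illustrative IR-to-asm sketch, not from this file):
//   %v = call i32 @llvm.bswap.i32(i32 %x)
//   store i32 %v, ptr %p          ; emitted as a single: movbe %eax, (%rdi)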
4638
4639 // Legalize the type.
4640 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4641 MVT MTy = LT.second;
4642
4643 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4644 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4645 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4646 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4647 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4648 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4649 if (Cst->isAllOnesValue())
4651 }
4652
4653 // FSQRT is a single instruction.
4654 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4655 return LT.first;
4656
4657 if (ST->useGLMDivSqrtCosts())
4658 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4659 if (auto KindCost = Entry->Cost[CostKind])
4660 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4661
4662 if (ST->useSLMArithCosts())
4663 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4664 if (auto KindCost = Entry->Cost[CostKind])
4665 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4666
4667 if (ST->hasVBMI2())
4668 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4669 if (auto KindCost = Entry->Cost[CostKind])
4670 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4671
4672 if (ST->hasBITALG())
4673 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4674 if (auto KindCost = Entry->Cost[CostKind])
4675 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4676
4677 if (ST->hasVPOPCNTDQ())
4678 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4679 if (auto KindCost = Entry->Cost[CostKind])
4680 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4681
4682 if (ST->hasGFNI())
4683 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4684 if (auto KindCost = Entry->Cost[CostKind])
4685 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4686
4687 if (ST->hasCDI())
4688 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4689 if (auto KindCost = Entry->Cost[CostKind])
4690 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4691
4692 if (ST->hasBWI())
4693 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4694 if (auto KindCost = Entry->Cost[CostKind])
4695 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4696
4697 if (ST->hasAVX512())
4698 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4699 if (auto KindCost = Entry->Cost[CostKind])
4700 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4701
4702 if (ST->hasXOP())
4703 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4704 if (auto KindCost = Entry->Cost[CostKind])
4705 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4706
4707 if (ST->hasAVX2())
4708 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4709 if (auto KindCost = Entry->Cost[CostKind])
4710 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4711
4712 if (ST->hasAVX())
4713 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4714 if (auto KindCost = Entry->Cost[CostKind])
4715 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4716
4717 if (ST->hasSSE42())
4718 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4719 if (auto KindCost = Entry->Cost[CostKind])
4720 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4721
4722 if (ST->hasSSE41())
4723 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4724 if (auto KindCost = Entry->Cost[CostKind])
4725 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4726
4727 if (ST->hasSSSE3())
4728 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4729 if (auto KindCost = Entry->Cost[CostKind])
4730 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4731
4732 if (ST->hasSSE2())
4733 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4734 if (auto KindCost = Entry->Cost[CostKind])
4735 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4736
4737 if (ST->hasSSE1())
4738 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4739 if (auto KindCost = Entry->Cost[CostKind])
4740 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4741
4742 if (ST->hasBMI()) {
4743 if (ST->is64Bit())
4744 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4745 if (auto KindCost = Entry->Cost[CostKind])
4746 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4747
4748 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4749 if (auto KindCost = Entry->Cost[CostKind])
4750 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4751 }
4752
4753 if (ST->hasLZCNT()) {
4754 if (ST->is64Bit())
4755 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4756 if (auto KindCost = Entry->Cost[CostKind])
4757 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4758
4759 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4760 if (auto KindCost = Entry->Cost[CostKind])
4761 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4762 }
4763
4764 if (ST->hasPOPCNT()) {
4765 if (ST->is64Bit())
4766 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4767 if (auto KindCost = Entry->Cost[CostKind])
4768 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4769
4770 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4771 if (auto KindCost = Entry->Cost[CostKind])
4772 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4773 }
4774
4775 if (ST->is64Bit())
4776 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4777 if (auto KindCost = Entry->Cost[CostKind])
4778 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4779
4780 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4781 if (auto KindCost = Entry->Cost[CostKind])
4782 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4783
4784 // Without arg data, we need to compute the expanded costs of custom lowered
4785 // intrinsics to prevent use of the (very low) default costs.
4786 if (ICA.isTypeBasedOnly() &&
4787 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4788 Type *CondTy = RetTy->getWithNewBitWidth(1);
4789 InstructionCost Cost = 0;
4790 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4791 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4792 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4793 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4794 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4795 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4796 CmpInst::ICMP_EQ, CostKind);
4797 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4798 CmpInst::ICMP_EQ, CostKind);
4799 return Cost;
4800 }
4801 }
4802
4803 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4804}
4805
4806InstructionCost X86TTIImpl::getVectorInstrCost(
4807 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4808 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4809 static const CostTblEntry SLMCostTbl[] = {
4810 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4811 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4812 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4813 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4814 };
4815
4816 assert(Val->isVectorTy() && "This must be a vector type");
4817 auto *VT = cast<VectorType>(Val);
4818 if (VT->isScalableTy())
4819 return InstructionCost::getInvalid();
4820
4821 Type *ScalarType = Val->getScalarType();
4822 InstructionCost RegisterFileMoveCost = 0;
4823
4824 // Non-immediate extraction/insertion can be handled as a sequence of
4825 // aliased loads+stores via the stack.
4826 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4827 Opcode == Instruction::InsertElement)) {
4828 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4829 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4830
4831 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4832 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4833 Align VecAlign = DL.getPrefTypeAlign(Val);
4834 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4835
4836 // Extract - store vector to stack, load scalar.
4837 if (Opcode == Instruction::ExtractElement) {
4838 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4839 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4840 CostKind);
4841 }
4842 // Insert - store vector to stack, store scalar, load vector.
4843 if (Opcode == Instruction::InsertElement) {
4844 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4845 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4846 CostKind) +
4847 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4848 }
4849 }
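  // For example, a variable-index extract from <4 x float> is modelled as a
  // 16-byte vector store to the stack plus an f32 scalar load from the
  // indexed slot; the matching insert additionally pays a scalar store and a
  // vector reload.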
4850
4851 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4852 Opcode == Instruction::InsertElement)) {
4853 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4854 if (Opcode == Instruction::ExtractElement &&
4855 ScalarType->getScalarSizeInBits() == 1 &&
4856 cast<FixedVectorType>(Val)->getNumElements() > 1)
4857 return 1;
4858
4859 // Legalize the type.
4860 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4861
4862 // This type is legalized to a scalar type.
4863 if (!LT.second.isVector())
4864 return TTI::TCC_Free;
4865
4866 // The type may be split. Normalize the index to the new type.
4867 unsigned SizeInBits = LT.second.getSizeInBits();
4868 unsigned NumElts = LT.second.getVectorNumElements();
4869 unsigned SubNumElts = NumElts;
4870 Index = Index % NumElts;
4871
4872 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4873 // For inserts, we also need to insert the subvector back.
4874 if (SizeInBits > 128) {
4875 assert((SizeInBits % 128) == 0 && "Illegal vector");
4876 unsigned NumSubVecs = SizeInBits / 128;
4877 SubNumElts = NumElts / NumSubVecs;
4878 if (SubNumElts <= Index) {
4879 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4880 Index %= SubNumElts;
4881 }
4882 }
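    // For example, with AVX2 a <8 x i32> is a single legal 256-bit vector
    // (SizeInBits = 256, NumElts = 8, SubNumElts = 4): accessing index 5
    // lands in the upper 128-bit subvector, so RegisterFileMoveCost gains 1
    // for an extract (or 2 for an insert, which must also re-insert the
    // subvector) and Index is rebased to 5 % 4 == 1.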
4883
4884 MVT MScalarTy = LT.second.getScalarType();
4885 auto IsCheapPInsrPExtrInsertPS = [&]() {
4886 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4887 // Inserting f32 into index0 is just movss.
4888 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4889 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4890 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4891 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4892 Opcode == Instruction::InsertElement) ||
4893 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4894 Opcode == Instruction::InsertElement);
4895 };
4896
4897 if (Index == 0) {
4898 // Floating point scalars are already located in index #0.
4899 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4900 // true for all.
4901 if (ScalarType->isFloatingPointTy() &&
4902 (Opcode != Instruction::InsertElement || !Op0 ||
4903 isa<UndefValue>(Op0)))
4904 return RegisterFileMoveCost;
4905
4906 if (Opcode == Instruction::InsertElement &&
4907 isa_and_nonnull<UndefValue>(Op0)) {
4908 // Consider the gather cost to be cheap.
4909 if (isa_and_nonnull<LoadInst>(Op1))
4910 return RegisterFileMoveCost;
4911 if (!IsCheapPInsrPExtrInsertPS()) {
4912 // mov constant-to-GPR + movd/movq GPR -> XMM.
4913 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4914 return 2 + RegisterFileMoveCost;
4915 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4916 return 1 + RegisterFileMoveCost;
4917 }
4918 }
4919
4920 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4921 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4922 return 1 + RegisterFileMoveCost;
4923 }
4924
4925 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4926 assert(ISD && "Unexpected vector opcode");
4927 if (ST->useSLMArithCosts())
4928 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4929 return Entry->Cost + RegisterFileMoveCost;
4930
4931 // Consider cheap cases.
4932 if (IsCheapPInsrPExtrInsertPS())
4933 return 1 + RegisterFileMoveCost;
4934
4935 // For extractions we just need to shuffle the element to index 0, which
4936 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4937 // the element to its destination. In both cases we must handle the
4938 // subvector move(s).
4939 // If the vector type is already less than 128-bits then don't reduce it.
4940 // TODO: Under what circumstances should we shuffle using the full width?
4941 InstructionCost ShuffleCost = 1;
4942 if (Opcode == Instruction::InsertElement) {
4943 auto *SubTy = cast<VectorType>(Val);
4944 EVT VT = TLI->getValueType(DL, Val);
4945 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4946 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4947 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
4948 CostKind, 0, SubTy);
4949 }
4950 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4951 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4952 }
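  // E.g. extracting element 1 of <2 x double> on SSE2 takes the path above:
  // f64 has no cheap pextr form, so the cost is ShuffleCost (1) +
  // IntOrFpCost (0 for FP) + RegisterFileMoveCost (0 for a 128-bit vector),
  // i.e. one shuffle to move the lane down to index 0.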
4953
4954 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
4955 VIC) +
4956 RegisterFileMoveCost;
4957}
4958
4959InstructionCost X86TTIImpl::getScalarizationOverhead(
4960 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4961 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4962 TTI::VectorInstrContext VIC) const {
4963 assert(DemandedElts.getBitWidth() ==
4964 cast<FixedVectorType>(Ty)->getNumElements() &&
4965 "Vector size mismatch");
4966
4967 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4968 MVT MScalarTy = LT.second.getScalarType();
4969 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4970 InstructionCost Cost = 0;
4971
4972 constexpr unsigned LaneBitWidth = 128;
4973 assert((LegalVectorBitWidth < LaneBitWidth ||
4974 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4975 "Illegal vector");
4976
4977 const int NumLegalVectors = LT.first.getValue();
4978 assert(NumLegalVectors >= 0 && "Negative cost!");
4979
4980 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4981 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
4982 // a special heuristic regarding poison input which is passed here in
4983 // ForPoisonSrc.
4984 if (Insert && !ForPoisonSrc) {
4985 // This is nearly identical to BaseT::getScalarizationOverhead(), except
4986 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
4987 // Constant::getNullValue()), which makes the X86TTIImpl
4988 // getVectorInstrCost() return 0 instead of 1.
4989 for (unsigned I : seq(DemandedElts.getBitWidth())) {
4990 if (!DemandedElts[I])
4991 continue;
4992 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
4993 /*Op0=*/nullptr,
4994 VL.empty() ? nullptr : VL[I],
4995 VIC);
4996 }
4997 return Cost;
4998 }
4999
5000 if (Insert) {
5001 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
5002 (MScalarTy.isInteger() && ST->hasSSE41()) ||
5003 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
5004 // For types we can insert directly, insertion into 128-bit sub vectors is
5005 // cheap, followed by a cheap chain of concatenations.
5006 if (LegalVectorBitWidth <= LaneBitWidth) {
5007 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5008 /*Extract*/ false, CostKind);
5009 } else {
5010 // In each 128-bit lane, if at least one index is demanded but not all
5011 // indices are demanded and this 128-bit lane is not the first lane of
5012 // the legalized vector, then this lane needs an extracti128; if in
5013 // each 128-bit lane there is at least one demanded index, the lane
5014 // needs an inserti128.
5015
5016 // The following cases will help you build a better understanding:
5017 // Assume we insert several elements into a v8i32 vector in avx2,
5018 // Case#1: inserting into 1st index needs vpinsrd + inserti128.
5019 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
5020 // inserti128.
5021 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
5022 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5023 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5024 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5025 unsigned NumLegalElts =
5026 LT.second.getVectorNumElements() * NumLegalVectors;
5027 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5028 "Vector has been legalized to smaller element count");
5029 assert((NumLegalElts % NumLanesTotal) == 0 &&
5030 "Unexpected elts per lane");
5031 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5032
5033 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5034 auto *LaneTy =
5035 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5036
5037 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5038 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5039 NumEltsPerLane, NumEltsPerLane * I);
5040 if (LaneEltMask.isZero())
5041 continue;
5042 // FIXME: we don't need to extract if all non-demanded elements
5043 // are legalization-inserted padding.
5044 if (!LaneEltMask.isAllOnes())
5045 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5046 CostKind, I * NumEltsPerLane, LaneTy);
5047 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5048 /*Extract*/ false, CostKind);
5049 }
5050
5051 APInt AffectedLanes =
5052 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5053 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5054 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5055 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5056 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5057 unsigned I = NumLegalLanes * LegalVec + Lane;
5058 // No need to insert unaffected lane; or lane 0 of each legal vector
5059 // iff ALL lanes of that vector were affected and will be inserted.
5060 if (!AffectedLanes[I] ||
5061 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5062 continue;
5063 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, Ty, {},
5064 CostKind, I * NumEltsPerLane, LaneTy);
5065 }
5066 }
5067 }
5068 } else if (LT.second.isVector()) {
5069 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5070 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5071 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5072 // considered cheap.
5073 if (Ty->isIntOrIntVectorTy())
5074 Cost += DemandedElts.popcount();
5075
5076 // Get the smaller of the legalized or original pow2-extended number of
5077 // vector elements, which represents the number of unpacks we'll end up
5078 // performing.
5079 unsigned NumElts = LT.second.getVectorNumElements();
5080 unsigned Pow2Elts =
5081 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
5082 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5083 }
5084 }
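  // Worked example (SSE2-only target, so no cheap PINSR path for i32):
  // building all four elements of a <4 x i32> costs DemandedElts.popcount()
  // = 4 GPR->XMM moves plus (min(NumElts, Pow2Elts) - 1) = 3 unpack steps,
  // for a total scalarization overhead of 7.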
5085
5086 if (Extract) {
5087 // vXi1 can be efficiently extracted with MOVMSK.
5088 // TODO: AVX512 predicate mask handling.
5089 // NOTE: This doesn't work well for roundtrip scalarization.
5090 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5091 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5092 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5093 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5094 return MOVMSKCost;
5095 }
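    // E.g. extracting all lanes of a <32 x i1>: with SSE2 each PMOVMSKB
    // covers 16 elements, so two MOVMSK ops are charged; with AVX2 (32
    // elements per VPMOVMSKB) the cost drops to 1.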
5096
5097 if (LT.second.isVector()) {
5098 unsigned NumLegalElts =
5099 LT.second.getVectorNumElements() * NumLegalVectors;
5100 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5101 "Vector has been legalized to smaller element count");
5102
5103 // If we're extracting elements from a 128-bit subvector lane,
5104 // we only need to extract each lane once, not for every element.
5105 if (LegalVectorBitWidth > LaneBitWidth) {
5106 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5107 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5108 assert((NumLegalElts % NumLanesTotal) == 0 &&
5109 "Unexpected elts per lane");
5110 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5111
5112 // Add cost for each demanded 128-bit subvector extraction.
5113 // Luckily this is a lot easier than for insertion.
5114 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5115 auto *LaneTy =
5116 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5117
5118 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5119 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5120 NumEltsPerLane, I * NumEltsPerLane);
5121 if (LaneEltMask.isZero())
5122 continue;
5123 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {}, CostKind,
5124 I * NumEltsPerLane, LaneTy);
5125 Cost += BaseT::getScalarizationOverhead(
5126 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5127 }
5128
5129 return Cost;
5130 }
5131 }
5132
5133 // Fallback to default extraction.
5134 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5135 Extract, CostKind);
5136 }
5137
5138 return Cost;
5139}
5140
5141InstructionCost
5142X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5143 int VF, const APInt &DemandedDstElts,
5144 TTI::TargetCostKind CostKind) const {
5145 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5146 // We don't differentiate element types here, only element bit width.
5147 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5148
5149 auto bailout = [&]() {
5150 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5151 DemandedDstElts, CostKind);
5152 };
5153
5154 // For now, only deal with AVX512 cases.
5155 if (!ST->hasAVX512())
5156 return bailout();
5157
5158 // Do we have a native shuffle for this element type, or should we promote?
5159 unsigned PromEltTyBits = EltTyBits;
5160 switch (EltTyBits) {
5161 case 32:
5162 case 64:
5163 break; // AVX512F.
5164 case 16:
5165 if (!ST->hasBWI())
5166 PromEltTyBits = 32; // promote to i32, AVX512F.
5167 break; // AVX512BW
5168 case 8:
5169 if (!ST->hasVBMI())
5170 PromEltTyBits = 32; // promote to i32, AVX512F.
5171 break; // AVX512VBMI
5172 case 1:
5173 // There is no support for shuffling i1 elements. We *must* promote.
5174 if (ST->hasBWI()) {
5175 if (ST->hasVBMI())
5176 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5177 else
5178 PromEltTyBits = 16; // promote to i16, AVX512BW.
5179 break;
5180 }
5181 PromEltTyBits = 32; // promote to i32, AVX512F.
5182 break;
5183 default:
5184 return bailout();
5185 }
5186 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5187
5188 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5189 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5190
5191 int NumDstElements = VF * ReplicationFactor;
5192 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5193 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5194
5195 // Legalize the types.
5196 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5197 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5198 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5199 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5200 // They should have legalized into vector types.
5201 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5202 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5203 return bailout();
5204
5205 if (PromEltTyBits != EltTyBits) {
5206 // If we have to perform the shuffle with wider elt type than our data type,
5207 // then we will first need to anyext (we don't care about the new bits)
5208 // the source elements, and then truncate Dst elements.
5209 InstructionCost PromotionCost;
5210 PromotionCost += getCastInstrCost(
5211 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5212 TTI::CastContextHint::None, CostKind);
5213 PromotionCost +=
5214 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5215 /*Src=*/PromDstVecTy,
5216 TTI::CastContextHint::None, CostKind);
5217 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5218 ReplicationFactor, VF,
5219 DemandedDstElts, CostKind);
5220 }
5221
5222 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5223 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5224 "We expect that the legalization doesn't affect the element width, "
5225 "doesn't coalesce/split elements.");
5226
5227 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5228 unsigned NumDstVectors =
5229 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5230
5231 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5232
5233 // Not all the produced Dst elements may be demanded. In our case,
5234 // given that a single Dst vector is formed by a single shuffle,
5235 // if all elements that will form a single Dst vector aren't demanded,
5236 // then we won't need to do that shuffle, so adjust the cost accordingly.
5237 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5238 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5239 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5240
5241 InstructionCost SingleShuffleCost =
5242 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5243 /*Mask=*/{}, CostKind,
5244 /*Index=*/0, /*SubTp=*/nullptr);
5245 return NumDstVectorsDemanded * SingleShuffleCost;
5246}
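// Worked example for the function above (AVX512F target): replicating
// <16 x i32> by a factor of 2 yields 32 i32 results, i.e. two v16i32
// destination vectors. If the demanded mask only touches the first 16
// results, a single single-source shuffle is charged instead of two.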
5247
5248InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5249 Align Alignment,
5250 unsigned AddressSpace,
5251 TTI::TargetCostKind CostKind,
5252 TTI::OperandValueInfo OpInfo,
5253 const Instruction *I) const {
5254 // TODO: Handle other cost kinds.
5255 if (CostKind != TTI::TCK_RecipThroughput) {
5256 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5257 // Store instruction with index and scale costs 2 Uops.
5258 // Check the preceding GEP to identify non-const indices.
5259 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5260 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5261 return TTI::TCC_Basic * 2;
5262 }
5263 }
5264 return TTI::TCC_Basic;
5265 }
5266
5267 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5268 "Invalid Opcode");
5269 // Type legalization can't handle structs
5270 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5271 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5272 CostKind, OpInfo, I);
5273
5274 // Legalize the type.
5275 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5276
5277 auto *VTy = dyn_cast<FixedVectorType>(Src);
5278
5279 InstructionCost Cost = 0;
5280
5281 // Add a cost for constant load to vector.
5282 if (Opcode == Instruction::Store && OpInfo.isConstant())
5283 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5284 /*AddressSpace=*/0, CostKind, OpInfo);
5285
5286 // Handle the simple case of non-vectors.
5287 // NOTE: this assumes that legalization never creates a vector from scalars!
5288 if (!VTy || !LT.second.isVector()) {
5289 // Each load/store unit costs 1.
5290 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5291 }
5292
5293 bool IsLoad = Opcode == Instruction::Load;
5294
5295 Type *EltTy = VTy->getElementType();
5296
5297 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5298
5299 // Source of truth: how many elements were there in the original IR vector?
5300 const unsigned SrcNumElt = VTy->getNumElements();
5301
5302 // How far have we gotten?
5303 int NumEltRemaining = SrcNumElt;
5304 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5305 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5306
5307 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5308
5309 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5310 const unsigned XMMBits = 128;
5311 if (XMMBits % EltTyBits != 0)
5312 // Vector size must be a multiple of the element size. I.e. no padding.
5313 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5314 CostKind, OpInfo, I);
5315 const int NumEltPerXMM = XMMBits / EltTyBits;
5316
5317 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5318
5319 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5320 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5321 // How many elements would a single op deal with at once?
5322 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5323 // Vector size must be a multiple of the element size. I.e. no padding.
5324 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5325 CostKind, OpInfo, I);
5326 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5327
5328 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5329 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5330 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5331 "Unless we haven't halved the op size yet, "
5332 "we have less than two op's sized units of work left.");
5333
5334 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5335 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5336 : XMMVecTy;
5337
5338 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5339 "After halving sizes, the vector elt count is no longer a multiple "
5340 "of number of elements per operation?");
5341 auto *CoalescedVecTy =
5342 CurrNumEltPerOp == 1
5343 ? CurrVecTy
5344 : FixedVectorType::get(
5345 IntegerType::get(Src->getContext(),
5346 EltTyBits * CurrNumEltPerOp),
5347 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5348 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5349 DL.getTypeSizeInBits(CurrVecTy) &&
5350 "coalesciing elements doesn't change vector width.");
5351
5352 while (NumEltRemaining > 0) {
5353 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
5354
5355 // Can we use this vector size, as per the remaining element count?
5356 // Iff the vector is naturally aligned, we can do a wide load regardless.
5357 if (NumEltRemaining < CurrNumEltPerOp &&
5358 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5359 break; // Try smaller vector size.
5360
5361 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5362 // as a proxy for a double-pumped AVX memory interface such as on
5363 // Sandybridge.
5364 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5365 // will be scalarized.
5366 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5367 Cost += 2;
5368 else if (CurrOpSizeBytes < 4)
5369 Cost += 2;
5370 else
5371 Cost += 1;
5372
5373 // If we're loading a uniform value, we don't need to split the load;
5374 // a single (widest) vector load can be reused by all splits.
5375 if (IsLoad && OpInfo.isUniform())
5376 return Cost;
5377
5378 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5379
5380 // If we have fully processed the previous reg, we need to replenish it.
5381 if (SubVecEltsLeft == 0) {
5382 SubVecEltsLeft += CurrVecTy->getNumElements();
5383 // And that's free only for the 0'th subvector of a legalized vector.
5384 if (!Is0thSubVec)
5385 Cost +=
5386 getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5387 : TTI::ShuffleKind::SK_ExtractSubvector,
5388 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5389 }
5390
5391 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5392 // for smaller widths (32/16/8) we have to insert/extract them separately.
5393 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5394 // but let's pretend that it is also true for 16/8 bit wide ops...)
5395 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5396 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5397 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5398 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5399 APInt DemandedElts =
5400 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5401 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5402 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5403 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5404 !IsLoad, CostKind);
5405 }
5406
5407 SubVecEltsLeft -= CurrNumEltPerOp;
5408 NumEltRemaining -= CurrNumEltPerOp;
5409 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5410 }
5411 }
5412
5413 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5414
5415 return Cost;
5416}
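// Worked example for the loop above: a load of <8 x float> on an AVX target
// legalizes to one 32-byte op; if unaligned 32-byte accesses are fast this
// charges 1, while on a double-pumped interface (isUnalignedMem32Slow) it
// charges 2.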
5417
5421 switch (MICA.getID()) {
5422 case Intrinsic::masked_scatter:
5423 case Intrinsic::masked_gather:
5424 return getGatherScatterOpCost(MICA, CostKind);
5425 case Intrinsic::masked_load:
5426 case Intrinsic::masked_store:
5427 return getMaskedMemoryOpCost(MICA, CostKind);
5428 }
5430}
5431
5435 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
5436 : Instruction::Store;
5437 Type *SrcTy = MICA.getDataType();
5438 Align Alignment = MICA.getAlignment();
5439 unsigned AddressSpace = MICA.getAddressSpace();
5440
5441 bool IsLoad = (Instruction::Load == Opcode);
5442 bool IsStore = (Instruction::Store == Opcode);
5443
5444 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5445 if (!SrcVTy)
5446 // To calculate the scalar cost, take the regular cost without the mask.
5447 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5448
5449 unsigned NumElem = SrcVTy->getNumElements();
5450 auto *MaskTy =
5451 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5452 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5453 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5454 // Scalarization
5455 APInt DemandedElts = APInt::getAllOnes(NumElem);
5456 InstructionCost MaskSplitCost = getScalarizationOverhead(
5457 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5458 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5459 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5460 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5461 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5462 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5463 InstructionCost ValueSplitCost = getScalarizationOverhead(
5464 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5465 InstructionCost MemopCost =
5466 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5467 Alignment, AddressSpace, CostKind);
5468 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5469 }
5470
5471 // Legalize the type.
5472 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5473 auto VT = TLI->getValueType(DL, SrcVTy);
5474 InstructionCost Cost = 0;
5475 MVT Ty = LT.second;
5476 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5477 // APX masked load/store for scalar is cheap.
5478 return Cost + LT.first;
5479
5480 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5481 LT.second.getVectorNumElements() == NumElem)
5482 // Promotion requires extend/truncate for data and a shuffle for mask.
5483 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5484 0, nullptr) +
5485 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5486 0, nullptr);
5487
5488 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5489 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5490 (unsigned)LT.first.getValue() *
5491 Ty.getVectorNumElements());
5492 // Expanding requires filling the mask with zeroes.
5493 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5494 CostKind, 0, MaskTy);
5495 }
5496
5497 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5498 if (!ST->hasAVX512())
5499 return Cost + LT.first * (IsLoad ? 2 : 8);
5500
5501 // AVX-512 masked load/store is cheaper
5502 return Cost + LT.first;
5503}
5504
5505InstructionCost X86TTIImpl::getPointersChainCost(
5506 ArrayRef<const Value *> Ptrs, const Value *Base,
5507 const TTI::PointersChainInfo &Info, Type *AccessTy,
5508 TTI::TargetCostKind CostKind) const {
5509 if (Info.isSameBase() && Info.isKnownStride()) {
5510 // If all the pointers have known stride all the differences are translated
5511 // into constants. X86 memory addressing allows encoding it into
5512 // displacement. So we just need to take the base GEP cost.
5513 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5514 SmallVector<const Value *> Indices(BaseGEP->indices());
5515 return getGEPCost(BaseGEP->getSourceElementType(),
5516 BaseGEP->getPointerOperand(), Indices, nullptr,
5517 CostKind);
5518 }
5519 return TTI::TCC_Free;
5520 }
5521 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5522}
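// For example, a chain of loads from p, p+4, p+8, ... with a known constant
// stride folds all offsets into x86 addressing-mode displacements, so only
// the shared base GEP is charged and the rest is free.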
5523
5524InstructionCost
5525X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
5526 const SCEV *Ptr,
5527 TTI::TargetCostKind CostKind) const {
5528 // Address computations in vectorized code with non-consecutive addresses will
5529 // likely result in more instructions compared to scalar code where the
5530 // computation can more often be merged into the index mode. The resulting
5531 // extra micro-ops can significantly decrease throughput.
5532 const unsigned NumVectorInstToHideOverhead = 10;
5533
5534 // Cost modeling of Strided Access Computation is hidden by the indexing
5535 // modes of X86 regardless of the stride value. We don't believe that there
5536 // is a difference between constant strided access in general and constant
5537 // strided value which is less than or equal to 64.
5538 // Even in the case of (loop invariant) stride whose value is not known at
5539 // compile time, the address computation will not incur more than one extra
5540 // ADD instruction.
5541 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5542 // TODO: AVX2 is the current cut-off because we don't have correct
5543 // interleaving costs for prior ISA's.
5544 if (!BaseT::isStridedAccess(Ptr))
5545 return NumVectorInstToHideOverhead;
5546 if (!BaseT::getConstantStrideStep(SE, Ptr))
5547 return 1;
5548 }
5549
5550 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5551}
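// E.g. on a pre-AVX2 target, a vectorized access with no identifiable stride
// is charged NumVectorInstToHideOverhead (10), while a loop-invariant but
// non-constant stride costs just one extra ADD (1).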
5552
5553InstructionCost
5554X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5555 std::optional<FastMathFlags> FMF,
5556 TTI::TargetCostKind CostKind) const {
5557 if (TTI::requiresOrderedReduction(FMF))
5558 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5559
5560 // We use the Intel Architecture Code Analyzer (IACA) to measure throughput
5561 // and use that as the cost.
5562
5563 static const CostTblEntry SLMCostTbl[] = {
5564 { ISD::FADD, MVT::v2f64, 3 },
5565 { ISD::ADD, MVT::v2i64, 5 },
5566 };
5567
5568 static const CostTblEntry SSE2CostTbl[] = {
5569 { ISD::FADD, MVT::v2f64, 2 },
5570 { ISD::FADD, MVT::v2f32, 2 },
5571 { ISD::FADD, MVT::v4f32, 4 },
5572 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5573 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5574 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5575 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5576 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5577 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5578 { ISD::ADD, MVT::v2i8, 2 },
5579 { ISD::ADD, MVT::v4i8, 2 },
5580 { ISD::ADD, MVT::v8i8, 2 },
5581 { ISD::ADD, MVT::v16i8, 3 },
5582 };
5583
5584 static const CostTblEntry AVX1CostTbl[] = {
5585 { ISD::FADD, MVT::v4f64, 3 },
5586 { ISD::FADD, MVT::v4f32, 3 },
5587 { ISD::FADD, MVT::v8f32, 4 },
5588 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5589 { ISD::ADD, MVT::v4i64, 3 },
5590 { ISD::ADD, MVT::v8i32, 5 },
5591 { ISD::ADD, MVT::v16i16, 5 },
5592 { ISD::ADD, MVT::v32i8, 4 },
5593 };
5594
5595 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5596 assert(ISD && "Invalid opcode");
5597
5598 // Before legalizing the type, give a chance to look up illegal narrow types
5599 // in the table.
5600 // FIXME: Is there a better way to do this?
5601 EVT VT = TLI->getValueType(DL, ValTy);
5602 if (VT.isSimple()) {
5603 MVT MTy = VT.getSimpleVT();
5604 if (ST->useSLMArithCosts())
5605 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5606 return Entry->Cost;
5607
5608 if (ST->hasAVX())
5609 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5610 return Entry->Cost;
5611
5612 if (ST->hasSSE2())
5613 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5614 return Entry->Cost;
5615 }
5616
5617 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5618
5619 MVT MTy = LT.second;
5620
5621 auto *ValVTy = cast<FixedVectorType>(ValTy);
5622
5623 // Special case: vXi8 mul reductions are performed as vXi16.
5624 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5625 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5626 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5627 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5628 TTI::CastContextHint::None,
5629 CostKind) +
5630 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5631 }
5632
5633 InstructionCost ArithmeticCost = 0;
5634 if (LT.first != 1 && MTy.isVector() &&
5635 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5636 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5637 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5638 MTy.getVectorNumElements());
5639 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5640 ArithmeticCost *= LT.first - 1;
5641 }
5642
5643 if (ST->useSLMArithCosts())
5644 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5645 return ArithmeticCost + Entry->Cost;
5646
5647 if (ST->hasAVX())
5648 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5649 return ArithmeticCost + Entry->Cost;
5650
5651 if (ST->hasSSE2())
5652 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5653 return ArithmeticCost + Entry->Cost;
5654
5655 // FIXME: These assume a naive kshift+binop lowering, which is probably
5656 // conservative in most cases.
5657 static const CostTblEntry AVX512BoolReduction[] = {
5658 { ISD::AND, MVT::v2i1, 3 },
5659 { ISD::AND, MVT::v4i1, 5 },
5660 { ISD::AND, MVT::v8i1, 7 },
5661 { ISD::AND, MVT::v16i1, 9 },
5662 { ISD::AND, MVT::v32i1, 11 },
5663 { ISD::AND, MVT::v64i1, 13 },
5664 { ISD::OR, MVT::v2i1, 3 },
5665 { ISD::OR, MVT::v4i1, 5 },
5666 { ISD::OR, MVT::v8i1, 7 },
5667 { ISD::OR, MVT::v16i1, 9 },
5668 { ISD::OR, MVT::v32i1, 11 },
5669 { ISD::OR, MVT::v64i1, 13 },
5670 };
5671
5672 static const CostTblEntry AVX2BoolReduction[] = {
5673 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5674 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5675 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5676 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5677 };
5678
5679 static const CostTblEntry AVX1BoolReduction[] = {
5680 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5681 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5682 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5683 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5684 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5685 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5686 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5687 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5688 };
5689
5690 static const CostTblEntry SSE2BoolReduction[] = {
5691 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5692 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5693 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5694 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5695 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5696 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5697 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5698 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5699 };
5700
5701 // Handle bool allof/anyof patterns.
5702 if (ValVTy->getElementType()->isIntegerTy(1)) {
5703 if (ISD == ISD::ADD) {
5704 // vXi1 addition reduction will bitcast to scalar and perform a popcount.
5705 auto *IntTy = IntegerType::getIntNTy(ValVTy->getContext(),
5706 ValVTy->getNumElements());
5707 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy});
5708 return getCastInstrCost(Instruction::BitCast, IntTy, ValVTy,
5709 TTI::CastContextHint::None,
5710 CostKind) +
5711 getIntrinsicInstrCost(ICA, CostKind);
5712 }
5713
5714 InstructionCost ArithmeticCost = 0;
5715 if (LT.first != 1 && MTy.isVector() &&
5716 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5717 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5718 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5719 MTy.getVectorNumElements());
5720 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5721 ArithmeticCost *= LT.first - 1;
5722 }
5723
5724 if (ST->hasAVX512())
5725 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5726 return ArithmeticCost + Entry->Cost;
5727 if (ST->hasAVX2())
5728 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5729 return ArithmeticCost + Entry->Cost;
5730 if (ST->hasAVX())
5731 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5732 return ArithmeticCost + Entry->Cost;
5733 if (ST->hasSSE2())
5734 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5735 return ArithmeticCost + Entry->Cost;
5736
5737 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5738 }
5739
5740 unsigned NumVecElts = ValVTy->getNumElements();
5741 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5742
5743 // Special case power of 2 reductions where the scalar type isn't changed
5744 // by type legalization.
5745 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5746 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5747
5748 InstructionCost ReductionCost = 0;
5749
5750 auto *Ty = ValVTy;
5751 if (LT.first != 1 && MTy.isVector() &&
5752 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5753 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5754 Ty = FixedVectorType::get(ValVTy->getElementType(),
5755 MTy.getVectorNumElements());
5756 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5757 ReductionCost *= LT.first - 1;
5758 NumVecElts = MTy.getVectorNumElements();
5759 }
5760
5761 // Now handle reduction with the legal type, taking into account size changes
5762 // at each level.
5763 while (NumVecElts > 1) {
5764 // Determine the size of the remaining vector we need to reduce.
5765 unsigned Size = NumVecElts * ScalarSize;
5766 NumVecElts /= 2;
5767 // If we're reducing from 256/512 bits, use an extract_subvector.
5768 if (Size > 128) {
5769 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5770 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5771 CostKind, NumVecElts, SubTy);
5772 Ty = SubTy;
5773 } else if (Size == 128) {
5774 // Reducing from 128 bits is a permute of v2f64/v2i64.
5775 FixedVectorType *ShufTy;
5776 if (ValVTy->isFloatingPointTy())
5777 ShufTy =
5778 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5779 else
5780 ShufTy =
5781 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5782 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5783 {}, CostKind, 0, nullptr);
5784 } else if (Size == 64) {
5785 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5786 FixedVectorType *ShufTy;
5787 if (ValVTy->isFloatingPointTy())
5788 ShufTy =
5789 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5790 else
5791 ShufTy =
5792 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5793 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5794 {}, CostKind, 0, nullptr);
5795 } else {
5796 // Reducing from smaller size is a shift by immediate.
5797 auto *ShiftTy = FixedVectorType::get(
5798 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5799 ReductionCost += getArithmeticInstrCost(
5800 Instruction::LShr, ShiftTy, CostKind,
5801 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5802 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5803 }
5804
5805 // Add the arithmetic op for this level.
5806 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5807 }
5808
5809 // Add the final extract element to the cost.
5810 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5811 CostKind, 0, nullptr, nullptr,
5813}
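// Worked example for the loop above (AVX target, fast-math fadd reduction of
// <8 x float>): the reduction halves 256 -> 128 -> 64 bits, charging one
// extract/shuffle plus one fadd per step, and finally pays the element-0
// extract, matching the classic three-level horizontal reduction.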
5814
5815InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5816 TTI::TargetCostKind CostKind,
5817 FastMathFlags FMF) const {
5818 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5819 return getIntrinsicInstrCost(ICA, CostKind);
5820}
5821
5822InstructionCost
5823X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5824 FastMathFlags FMF,
5825 TTI::TargetCostKind CostKind) const {
5826 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5827
5828 MVT MTy = LT.second;
5829
5830 int ISD;
5831 if (ValTy->isIntOrIntVectorTy()) {
5832 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5833 : ISD::SMIN;
5834 } else {
5835 assert(ValTy->isFPOrFPVectorTy() &&
5836 "Expected float point or integer vector type.");
5837 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5838 ? ISD::FMINNUM
5839 : ISD::FMINIMUM;
5840 }
5841
5842 // We use the Intel Architecture Code Analyzer (IACA) to measure throughput
5843 // and use that as the cost.
5844
5845 static const CostTblEntry SSE2CostTbl[] = {
5846 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5847 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5848 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5849 };
5850
5851 static const CostTblEntry SSE41CostTbl[] = {
5852 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5853 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5854 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5855 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5856 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5857 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5858 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5859 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5860 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5861 {ISD::SMIN, MVT::v16i8, 6},
5862 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5863 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5864 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5865 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5866 };
5867
5868 static const CostTblEntry AVX1CostTbl[] = {
5869 {ISD::SMIN, MVT::v16i16, 6},
5870 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5871 {ISD::SMIN, MVT::v32i8, 8},
5872 {ISD::UMIN, MVT::v32i8, 8},
5873 };
5874
5875 static const CostTblEntry AVX512BWCostTbl[] = {
5876 {ISD::SMIN, MVT::v32i16, 8},
5877 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5878 {ISD::SMIN, MVT::v64i8, 10},
5879 {ISD::UMIN, MVT::v64i8, 10},
5880 };
5881
5882 // Before legalizing the type, give a chance to look up illegal narrow types
5883 // in the table.
5884 // FIXME: Is there a better way to do this?
5885 EVT VT = TLI->getValueType(DL, ValTy);
5886 if (VT.isSimple()) {
5887 MVT MTy = VT.getSimpleVT();
5888 if (ST->hasBWI())
5889 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5890 return Entry->Cost;
5891
5892 if (ST->hasAVX())
5893 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5894 return Entry->Cost;
5895
5896 if (ST->hasSSE41())
5897 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5898 return Entry->Cost;
5899
5900 if (ST->hasSSE2())
5901 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5902 return Entry->Cost;
5903 }
5904
5905 auto *ValVTy = cast<FixedVectorType>(ValTy);
5906 unsigned NumVecElts = ValVTy->getNumElements();
5907
5908 auto *Ty = ValVTy;
5909 InstructionCost MinMaxCost = 0;
5910 if (LT.first != 1 && MTy.isVector() &&
5911 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5912 // Type needs to be split. We need LT.first - 1 operations.
5913 Ty = FixedVectorType::get(ValVTy->getElementType(),
5914 MTy.getVectorNumElements());
5915 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5916 MinMaxCost *= LT.first - 1;
5917 NumVecElts = MTy.getVectorNumElements();
5918 }
5919
5920 if (ST->hasBWI())
5921 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5922 return MinMaxCost + Entry->Cost;
5923
5924 if (ST->hasAVX())
5925 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5926 return MinMaxCost + Entry->Cost;
5927
5928 if (ST->hasSSE41())
5929 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5930 return MinMaxCost + Entry->Cost;
5931
5932 if (ST->hasSSE2())
5933 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5934 return MinMaxCost + Entry->Cost;
5935
5936 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5937
5938 // Special case power of 2 reductions where the scalar type isn't changed
5939 // by type legalization.
5940 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5941 ScalarSize != MTy.getScalarSizeInBits())
5942 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5943
5944 // Now handle reduction with the legal type, taking into account size changes
5945 // at each level.
5946 while (NumVecElts > 1) {
5947 // Determine the size of the remaining vector we need to reduce.
5948 unsigned Size = NumVecElts * ScalarSize;
5949 NumVecElts /= 2;
5950 // If we're reducing from 256/512 bits, use an extract_subvector.
5951 if (Size > 128) {
5952 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5953 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5954 CostKind, NumVecElts, SubTy);
5955 Ty = SubTy;
5956 } else if (Size == 128) {
5957 // Reducing from 128 bits is a permute of v2f64/v2i64.
5958 VectorType *ShufTy;
5959 if (ValTy->isFloatingPointTy())
5960 ShufTy =
5961 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5962 else
5963 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5964 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5965 CostKind, 0, nullptr);
5966 } else if (Size == 64) {
5967 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5968 FixedVectorType *ShufTy;
5969 if (ValTy->isFloatingPointTy())
5970 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5971 else
5972 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5973 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5974 CostKind, 0, nullptr);
5975 } else {
5976 // Reducing from smaller size is a shift by immediate.
5977 auto *ShiftTy = FixedVectorType::get(
5978 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5979 MinMaxCost += getArithmeticInstrCost(
5981 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5982 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
5983 }
5984
5985 // Add the arithmetic op for this level.
5986 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5987 }
5988
5989 // Add the final extract element to the cost.
5990 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5991 CostKind, 0, nullptr, nullptr,
5993}
5994
5995/// Calculate the cost of materializing a 64-bit value. This helper
5996/// method might only calculate a fraction of a larger immediate. Therefore it
5997/// is valid to return a cost of ZERO.
5998InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) const {
5999 if (Val == 0)
6000 return TTI::TCC_Free;
6001
6002 if (isInt<32>(Val))
6003 return TTI::TCC_Basic;
6004
6005 return 2 * TTI::TCC_Basic;
6006}
6007
6008InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
6009 TTI::TargetCostKind CostKind) const {
6010 assert(Ty->isIntegerTy());
6011
6012 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6013 if (BitSize == 0)
6014 return ~0U;
6015
6016 // Never hoist constants larger than 128 bits, because this might lead to
6017 // incorrect code generation or assertions in codegen.
6018 // FIXME: Create a cost model for types larger than i128 once the codegen
6019 // issues have been fixed.
6020 if (BitSize > 128)
6021 return TTI::TCC_Free;
6022
6023 if (Imm == 0)
6024 return TTI::TCC_Free;
6025
6026 // Sign-extend all constants to a multiple of 64-bit.
6027 APInt ImmVal = Imm;
6028 if (BitSize % 64 != 0)
6029 ImmVal = Imm.sext(alignTo(BitSize, 64));
6030
6031 // Split the constant into 64-bit chunks and calculate the cost for each
6032 // chunk.
6033 InstructionCost Cost = 0;
6034 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
6035 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
6036 int64_t Val = Tmp.getSExtValue();
6037 Cost += getIntImmCost(Val);
6038 }
6039 // We need at least one instruction to materialize the constant.
6040 return std::max<InstructionCost>(1, Cost);
6041}
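// Examples for the chunked costing above: an i64 constant fitting a signed
// 32-bit immediate costs 1 (TCC_Basic); a full 64-bit constant costs 2
// (movabsq-style materialization); an i128 value such as 7 splits into the
// chunks {7, 0}, costing 1 + 0, clamped up to a minimum of one instruction.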
6042
6043InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
6044 const APInt &Imm, Type *Ty,
6045 TTI::TargetCostKind CostKind,
6046 Instruction *Inst) const {
6047 assert(Ty->isIntegerTy());
6048
6049 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6050 unsigned ImmBitWidth = Imm.getBitWidth();
6051
6052 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6053 // here, so that constant hoisting will ignore this constant.
6054 if (BitSize == 0)
6055 return TTI::TCC_Free;
6056
6057 unsigned ImmIdx = ~0U;
6058 switch (Opcode) {
6059 default:
6060 return TTI::TCC_Free;
6061 case Instruction::GetElementPtr:
6062 // Always hoist the base address of a GetElementPtr. This prevents the
6063 // creation of new constants for every base constant that gets constant
6064 // folded with the offset.
6065 if (Idx == 0)
6066 return 2 * TTI::TCC_Basic;
6067 return TTI::TCC_Free;
6068 case Instruction::Store:
6069 ImmIdx = 0;
6070 break;
6071 case Instruction::ICmp:
6072 // This is an imperfect hack to prevent constant hoisting of
6073 // compares that might be trying to check if a 64-bit value fits in
6074 // 32-bits. The backend can optimize these cases using a right shift by 32.
6075 // There are other predicates and immediates the backend can use shifts for.
6076 if (Idx == 1 && ImmBitWidth == 64) {
6077 uint64_t ImmVal = Imm.getZExtValue();
6078 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6079 return TTI::TCC_Free;
6080
6081 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6082 if (Cmp->isEquality()) {
6083 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6084 if (Known.countMinTrailingZeros() >= 32)
6085 return TTI::TCC_Free;
6086 }
6087 }
6088 }
6089 ImmIdx = 1;
6090 break;
6091 case Instruction::And:
6092 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6093 // by using a 32-bit operation with implicit zero extension. Detect such
6094 // immediates here as the normal path expects bit 31 to be sign extended.
6095 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6096 return TTI::TCC_Free;
6097 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
6098 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6099 Imm.isMask())
6100 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6101 ImmIdx = 1;
6102 break;
6103 case Instruction::Add:
6104 case Instruction::Sub:
6105 // For add/sub, we can use the opposite instruction for INT32_MIN.
6106 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6107 return TTI::TCC_Free;
6108 ImmIdx = 1;
6109 break;
6110 case Instruction::UDiv:
6111 case Instruction::SDiv:
6112 case Instruction::URem:
6113 case Instruction::SRem:
6114 // Division by constant is typically expanded later into a different
6115 // instruction sequence. This completely changes the constants.
6116 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6117 return TTI::TCC_Free;
6118 case Instruction::Mul:
6119 case Instruction::Or:
6120 case Instruction::Xor:
6121 ImmIdx = 1;
6122 break;
6123 // Always return TCC_Free for the shift value of a shift instruction.
6124 case Instruction::Shl:
6125 case Instruction::LShr:
6126 case Instruction::AShr:
6127 if (Idx == 1)
6128 return TTI::TCC_Free;
6129 break;
6130 case Instruction::Trunc:
6131 case Instruction::ZExt:
6132 case Instruction::SExt:
6133 case Instruction::IntToPtr:
6134 case Instruction::PtrToInt:
6135 case Instruction::BitCast:
6136 case Instruction::PHI:
6137 case Instruction::Call:
6138 case Instruction::Select:
6139 case Instruction::Ret:
6140 case Instruction::Load:
6141 break;
6142 }
6143
6144 if (Idx == ImmIdx) {
6145 uint64_t NumConstants = divideCeil(BitSize, 64);
6146 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6147 return (Cost <= NumConstants * TTI::TCC_Basic)
6148 ? static_cast<int>(TTI::TCC_Free)
6149 : Cost;
6150 }
6151
6152 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6153}
6154
6155 InstructionCost
6156 X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6157 const APInt &Imm, Type *Ty,
6158 TTI::TargetCostKind CostKind) const {
6159 assert(Ty->isIntegerTy());
6160
6161 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6162 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6163 // here, so that constant hoisting will ignore this constant.
6164 if (BitSize == 0)
6165 return TTI::TCC_Free;
6166
6167 switch (IID) {
6168 default:
6169 return TTI::TCC_Free;
6170 case Intrinsic::sadd_with_overflow:
6171 case Intrinsic::uadd_with_overflow:
6172 case Intrinsic::ssub_with_overflow:
6173 case Intrinsic::usub_with_overflow:
6174 case Intrinsic::smul_with_overflow:
6175 case Intrinsic::umul_with_overflow:
6176 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6177 return TTI::TCC_Free;
6178 break;
6179 case Intrinsic::experimental_stackmap:
6180 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6181 return TTI::TCC_Free;
6182 break;
6183 case Intrinsic::experimental_patchpoint_void:
6184 case Intrinsic::experimental_patchpoint:
6185 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6186 return TTI::TCC_Free;
6187 break;
6188 }
6189 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6190}
6191
6192 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6193 TTI::TargetCostKind CostKind,
6194 const Instruction *I) const {
6195 if (CostKind != TTI::TCK_RecipThroughput)
6196 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6197 // Branches are assumed to be predicted.
6198 return TTI::TCC_Free;
6199}
6200
6201int X86TTIImpl::getGatherOverhead() const {
6202 // Some CPUs have more overhead for gather. The specified overhead is relative
6203 // to the Load operation. "2" is the number provided by Intel architects. This
6204 // parameter is used for cost estimation of Gather Op and comparison with
6205 // other alternatives.
6206 // TODO: Remove the explicit hasAVX512()? That would mean we would only
6207 // enable gather with a -march.
6208 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6209 return 2;
6210
6211 return 1024;
6212}
6213
6214int X86TTIImpl::getScatterOverhead() const {
6215 if (ST->hasAVX512())
6216 return 2;
6217
6218 return 1024;
6219}
6220
6221 // Return the average cost of a Gather / Scatter instruction; it may be improved later.
6222InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6224 Type *SrcVTy, const Value *Ptr,
6225 Align Alignment,
6226 unsigned AddressSpace) const {
6227
6228 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6229 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6230
6231 // Try to reduce index size from 64 bit (default for GEP)
6232 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6233 // operation will use 16 x 64 indices, which do not fit in a zmm and need
6234 // to be split. Also check that the base pointer is the same for all lanes,
6235 // and that there's at most one variable index.
6236 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6237 unsigned IndexSize = DL.getPointerSizeInBits();
6238 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6239 if (IndexSize < 64 || !GEP)
6240 return IndexSize;
6241
6242 unsigned NumOfVarIndices = 0;
6243 const Value *Ptrs = GEP->getPointerOperand();
6244 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6245 return IndexSize;
6246 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6247 if (isa<Constant>(GEP->getOperand(I)))
6248 continue;
6249 Type *IndxTy = GEP->getOperand(I)->getType();
6250 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6251 IndxTy = IndexVTy->getElementType();
6252 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6253 !isa<SExtInst>(GEP->getOperand(I))) ||
6254 ++NumOfVarIndices > 1)
6255 return IndexSize; // 64
6256 }
6257 return (unsigned)32;
6258 };
6259
6260 // Try to reduce IndexSize to 32 bits for VF 16.
6261 // By default the IndexSize is equal to the pointer size.
6262 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6263 ? getIndexSizeInBits(Ptr, DL)
6264 : DL.getPointerSizeInBits();
6265
6266 auto *IndexVTy = FixedVectorType::get(
6267 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6268 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6269 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6270 InstructionCost::CostType SplitFactor =
6271 std::max(IdxsLT.first, SrcLT.first).getValue();
6272 if (SplitFactor > 1) {
6273 // Handle splitting of vector of pointers
6274 auto *SplitSrcTy =
6275 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6276 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6277 Alignment, AddressSpace);
6278 }
6279
6280 // If we didn't split, this will be a single gather/scatter instruction.
6281 if (CostKind == TTI::TCK_CodeSize)
6282 return 1;
6283
6284 // The gather / scatter cost is given by Intel architects. It is a rough
6285 // number, since we are looking at one instruction at a time.
6286 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6287 : getScatterOverhead();
6288 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6289 Alignment, AddressSpace, CostKind);
6290}
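// Worked example (illustrative): a v16f32 gather on an AVX-512 target,
// assuming the GEP qualifies for the 32-bit index reduction above, uses a
// v16i32 index vector that legalizes without splitting, so the throughput
// cost is getGatherOverhead() (2) plus 16 times the scalar f32 load cost.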
6291
6292/// Calculate the cost of Gather / Scatter operation
6293 InstructionCost
6294 X86TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
6295 TTI::TargetCostKind CostKind) const {
6296 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
6297 MICA.getID() == Intrinsic::vp_gather;
6298 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
6299 Type *SrcVTy = MICA.getDataType();
6300 const Value *Ptr = MICA.getPointer();
6301 Align Alignment = MICA.getAlignment();
6302 if ((Opcode == Instruction::Load &&
6303 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6304 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6305 Align(Alignment)))) ||
6306 (Opcode == Instruction::Store &&
6307 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6308 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6309 Align(Alignment)))))
6310 return BaseT::getGatherScatterOpCost(MICA, CostKind);
6311
6312 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6313 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6314 if (!PtrTy && Ptr->getType()->isVectorTy())
6315 PtrTy = dyn_cast<PointerType>(
6316 cast<VectorType>(Ptr->getType())->getElementType());
6317 assert(PtrTy && "Unexpected type for Ptr argument");
6318 unsigned AddressSpace = PtrTy->getAddressSpace();
6319 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6320 AddressSpace);
6321}
6322
6323 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6324 const TargetTransformInfo::LSRCost &C2) const {
6325 // X86 specific here are "instruction number 1st priority".
6326 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
6327 C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6328 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
6329 C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6330}
6331
6332 bool X86TTIImpl::canMacroFuseCmp() const {
6333 return ST->hasMacroFusion() || ST->hasBranchFusion();
6334}
6335
6336static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6337 if (!ST->hasAVX())
6338 return false;
6339
6340 if (ScalarTy->isPointerTy())
6341 return true;
6342
6343 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6344 return true;
6345
6346 if (ScalarTy->isHalfTy() && ST->hasBWI())
6347 return true;
6348
6349 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6350 return true;
6351
6352 if (!ScalarTy->isIntegerTy())
6353 return false;
6354
6355 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6356 return IntWidth == 32 || IntWidth == 64 ||
6357 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6358}
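// For example, under the rules above: a masked load of <8 x float> is legal
// with plain AVX (vmaskmovps), while <32 x i8> additionally requires
// AVX-512 BWI because of the i8 element type.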
6359
6360 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
6361 unsigned AddressSpace,
6362 TTI::MaskKind MaskKind) const {
6363 Type *ScalarTy = DataTy->getScalarType();
6364
6365 // The backend can't handle a single element vector w/o CFCMOV.
6366 if (isa<VectorType>(DataTy) &&
6367 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6368 return ST->hasCF() &&
6369 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);
6370
6371 return isLegalMaskedLoadStore(ScalarTy, ST);
6372}
6373
6374 bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment,
6375 unsigned AddressSpace,
6376 TTI::MaskKind MaskKind) const {
6377 Type *ScalarTy = DataTy->getScalarType();
6378
6379 // The backend can't handle a single element vector w/o CFCMOV.
6380 if (isa<VectorType>(DataTy) &&
6381 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6382 return ST->hasCF() &&
6383 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);
6384
6385 return isLegalMaskedLoadStore(ScalarTy, ST);
6386}
6387
6388bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6389 unsigned DataSize = DL.getTypeStoreSize(DataType);
6390 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6391 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6392 // (the equivalent stores only require AVX).
6393 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6394 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6395
6396 return false;
6397}
6398
6399bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
6400 unsigned DataSize = DL.getTypeStoreSize(DataType);
6401
6402 // SSE4A supports nontemporal stores of float and double at arbitrary
6403 // alignment.
6404 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6405 return true;
6406
6407 // Besides the SSE4A subtarget exception above, only aligned stores are
6408 // available nontemporally on any other subtarget. And only stores with a size
6409 // of 4..32 bytes (powers of 2, only) are permitted.
6410 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6411 !isPowerOf2_32(DataSize))
6412 return false;
6413
6414 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6415 // loads require AVX2).
6416 if (DataSize == 32)
6417 return ST->hasAVX();
6418 if (DataSize == 16)
6419 return ST->hasSSE1();
6420 return true;
6421}
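// Illustrative IR: with AVX, this 32-byte nontemporal store is legal and
// can lower to vmovntps:
// store <8 x float> %v, ptr %p, align 32, !nontemporal !0
// where !0 = !{i32 1}.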
6422
6423 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6424 ElementCount NumElements) const {
6425 // movddup
6426 return ST->hasSSE3() && !NumElements.isScalable() &&
6427 NumElements.getFixedValue() == 2 &&
6428 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6429}
6430
6431bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6432 if (!isa<VectorType>(DataTy))
6433 return false;
6434
6435 if (!ST->hasAVX512())
6436 return false;
6437
6438 // The backend can't handle a single element vector.
6439 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6440 return false;
6441
6442 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6443
6444 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6445 return true;
6446
6447 if (!ScalarTy->isIntegerTy())
6448 return false;
6449
6450 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6451 return IntWidth == 32 || IntWidth == 64 ||
6452 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6453}
6454
6455 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy,
6456 Align Alignment) const {
6457 return isLegalMaskedExpandLoad(DataTy, Alignment);
6458}
6459
6460bool X86TTIImpl::supportsGather() const {
6461 // Some CPUs have better gather performance than others.
6462 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6463 // enable gather with a -march.
6464 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6465}
6466
6467 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy,
6468 Align Alignment) const {
6469 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6470 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6471 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6472 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6473 // Check, maybe the gather/scatter instruction is better in the VariableMask
6474 // case.
6475 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6476 return NumElts == 1 ||
6477 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6478}
6479
6480 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy,
6481 Align Alignment) const {
6482 Type *ScalarTy = DataTy->getScalarType();
6483 if (ScalarTy->isPointerTy())
6484 return true;
6485
6486 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6487 return true;
6488
6489 if (!ScalarTy->isIntegerTy())
6490 return false;
6491
6492 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6493 return IntWidth == 32 || IntWidth == 64;
6494}
6495
6496bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6497 if (!supportsGather() || !ST->preferGather())
6498 return false;
6499 return isLegalMaskedGatherScatter(DataTy, Alignment);
6500}
6501
6502bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6503 unsigned Opcode1,
6504 const SmallBitVector &OpcodeMask) const {
6505 // ADDSUBPS 4xf32 SSE3
6506 // VADDSUBPS 4xf32 AVX
6507 // VADDSUBPS 8xf32 AVX2
6508 // ADDSUBPD 2xf64 SSE3
6509 // VADDSUBPD 2xf64 AVX
6510 // VADDSUBPD 4xf64 AVX2
6511
6512 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6513 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6514 if (!isPowerOf2_32(NumElements))
6515 return false;
6516 // Check the opcode pattern. We apply the mask on the opcode arguments and
6517 // then check if it is what we expect.
6518 for (int Lane : seq<int>(0, NumElements)) {
6519 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6520 // We expect FSub for even lanes and FAdd for odd lanes.
6521 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6522 return false;
6523 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6524 return false;
6525 }
6526 // Now check that the pattern is supported by the target ISA.
6527 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6528 if (ElemTy->isFloatTy())
6529 return ST->hasSSE3() && NumElements % 4 == 0;
6530 if (ElemTy->isDoubleTy())
6531 return ST->hasSSE3() && NumElements % 2 == 0;
6532 return false;
6533}
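// For example, per the lane checks above: a 4 x f32 OpcodeMask selecting
// {FSub, FAdd, FSub, FAdd} for lanes 0..3 matches the (V)ADDSUBPS pattern
// and is reported legal from SSE3 onwards.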
6534
6535bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6536 // AVX2 doesn't support scatter
6537 if (!ST->hasAVX512() || !ST->preferScatter())
6538 return false;
6539 return isLegalMaskedGatherScatter(DataType, Alignment);
6540}
6541
6542bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6543 EVT VT = TLI->getValueType(DL, DataType);
6544 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6545}
6546
6547 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) const {
6548 // FDIV is always expensive, even if it has a very low uop count.
6549 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6550 if (I->getOpcode() == Instruction::FDiv)
6551 return true;
6552
6553 return BaseT::isExpensiveToSpeculativelyExecute(I);
6554}
6555
6556bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6557
6558 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6559 const Function *Callee) const {
6560 const TargetMachine &TM = getTLI()->getTargetMachine();
6561
6562 // Work this as a subsetting of subtarget features.
6563 const FeatureBitset &CallerBits =
6564 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6565 const FeatureBitset &CalleeBits =
6566 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6567
6568 // Check whether features are the same (apart from the ignore list).
6569 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6570 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6571 if (RealCallerBits == RealCalleeBits)
6572 return true;
6573
6574 // If the features are a subset, we need to additionally check for calls
6575 // that may become ABI-incompatible as a result of inlining.
6576 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6577 return false;
6578
6579 for (const Instruction &I : instructions(Callee)) {
6580 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6581 // Having more target features is fine for inline ASM and intrinsics.
6582 if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
6583 continue;
6584
6585 SmallVector<Type *, 8> Types;
6586 for (Value *Arg : CB->args())
6587 Types.push_back(Arg->getType());
6588 if (!CB->getType()->isVoidTy())
6589 Types.push_back(CB->getType());
6590
6591 // Simple types are always ABI compatible.
6592 auto IsSimpleTy = [](Type *Ty) {
6593 return !Ty->isVectorTy() && !Ty->isAggregateType();
6594 };
6595 if (all_of(Types, IsSimpleTy))
6596 continue;
6597
6598 // Do a precise compatibility check.
6599 if (!areTypesABICompatible(Caller, Callee, Types))
6600 return false;
6601 }
6602 }
6603 return true;
6604}
6605
6606 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6607 const Function *Callee,
6608 ArrayRef<Type *> Types) const {
6609 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6610 return false;
6611
6612 // If we get here, we know the target features match. If one function
6613 // considers 512-bit vectors legal and the other does not, consider them
6614 // incompatible.
6615 const TargetMachine &TM = getTLI()->getTargetMachine();
6616
6617 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6618 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6619 return true;
6620
6621 // Consider the arguments compatible if they aren't vectors or aggregates.
6622 // FIXME: Look at the size of vectors.
6623 // FIXME: Look at the element types of aggregates to see if there are vectors.
6624 return llvm::none_of(Types,
6625 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6626}
6627
6628 TTI::MemCmpExpansionOptions
6629 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6630 TTI::MemCmpExpansionOptions Options;
6631 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6632 Options.NumLoadsPerBlock = 2;
6633 // All GPR and vector loads can be unaligned.
6634 Options.AllowOverlappingLoads = true;
6635 if (IsZeroCmp) {
6636 // Only enable vector loads for equality comparison. Right now the vector
6637 // version is not as fast for three way compare (see #33329).
6638 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6639 if (PreferredWidth >= 512 && ST->hasAVX512())
6640 Options.LoadSizes.push_back(64);
6641 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6642 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6643 }
6644 if (ST->is64Bit()) {
6645 Options.LoadSizes.push_back(8);
6646 }
6647 Options.LoadSizes.push_back(4);
6648 Options.LoadSizes.push_back(2);
6649 Options.LoadSizes.push_back(1);
6650 return Options;
6651}
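// Illustrative expansion under these options: on a 64-bit AVX2 target, an
// equality memcmp of 31 bytes can use two overlapping 16-byte vector loads
// (at offsets 0 and 15), since AllowOverlappingLoads is set above.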
6652
6653 bool X86TTIImpl::prefersVectorizedAddressing() const {
6654 return supportsGather();
6655}
6656
6657 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6658 return false;
6659}
6660
6661 bool X86TTIImpl::enableInterleavedAccessVectorization() const {
6662 // TODO: We expect this to be beneficial regardless of arch,
6663 // but there are currently some unexplained performance artifacts on Atom.
6664 // As a temporary solution, disable on Atom.
6665 return !(ST->isAtom());
6666}
6667
6668// Get estimation for interleaved load/store operations and strided load.
6669// \p Indices contains indices for strided load.
6670// \p Factor - the factor of interleaving.
6671// AVX-512 provides 3-src shuffles that significantly reduces the cost.
6672 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6673 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6674 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6675 TTI::TargetCostKind CostKind, bool UseMaskForCond,
6676 bool UseMaskForGaps) const {
6677 // VecTy for interleave memop is <VF*Factor x Elt>.
6678 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6679 // VecTy = <12 x i32>.
6680
6681 // Calculate the number of memory operations (NumOfMemOps), required
6682 // for load/store the VecTy.
6683 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6684 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6685 unsigned LegalVTSize = LegalVT.getStoreSize();
6686 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6687
6688 // Get the cost of one memory operation.
6689 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6690 LegalVT.getVectorNumElements());
6691 InstructionCost MemOpCost;
6692 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6693 if (UseMaskedMemOp) {
6694 unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
6695 : Intrinsic::masked_store;
6696 MemOpCost = getMaskedMemoryOpCost(
6697 {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind);
6698 } else
6699 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
6700 CostKind);
6701
6702 unsigned VF = VecTy->getNumElements() / Factor;
6703 MVT VT =
6704 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6705
6706 InstructionCost MaskCost;
6707 if (UseMaskedMemOp) {
6708 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6709 for (unsigned Index : Indices) {
6710 assert(Index < Factor && "Invalid index for interleaved memory op");
6711 for (unsigned Elm = 0; Elm < VF; Elm++)
6712 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6713 }
6714
6715 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6716
6717 MaskCost = getReplicationShuffleCost(
6718 I1Type, Factor, VF,
6719 UseMaskForGaps ? DemandedLoadStoreElts
6720 : APInt::getAllOnes(VecTy->getNumElements()),
6721 CostKind);
6722
6723 // The Gaps mask is invariant and created outside the loop, therefore the
6724 // cost of creating it is not accounted for here. However if we have both
6725 // a MaskForGaps and some other mask that guards the execution of the
6726 // memory access, we need to account for the cost of And-ing the two masks
6727 // inside the loop.
6728 if (UseMaskForGaps) {
6729 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6730 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6731 }
6732 }
6733
6734 if (Opcode == Instruction::Load) {
6735 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6736 // contain the cost of the optimized shuffle sequence that the
6737 // X86InterleavedAccess pass will generate.
6738 // The cost of loads and stores are computed separately from the table.
6739
6740 // X86InterleavedAccess support only the following interleaved-access group.
6741 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6742 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6743 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6744 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
6745 };
6746
6747 if (const auto *Entry =
6748 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6749 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6750 // If an entry does not exist, fall back to the default implementation.
6751
6752 // Kind of shuffle depends on number of loaded values.
6753 // If we load the entire data in one register, we can use a 1-src shuffle.
6754 // Otherwise, we'll merge 2 sources in each operation.
6755 TTI::ShuffleKind ShuffleKind =
6756 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6757
6758 InstructionCost ShuffleCost = getShuffleCost(
6759 ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
6760
6761 unsigned NumOfLoadsInInterleaveGrp =
6762 Indices.size() ? Indices.size() : Factor;
6763 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6764 VecTy->getNumElements() / Factor);
6765 InstructionCost NumOfResults =
6766 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6767
6768 // About a half of the loads may be folded in shuffles when we have only
6769 // one result. If we have more than one result, or the loads are masked,
6770 // we do not fold loads at all.
6771 unsigned NumOfUnfoldedLoads =
6772 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6773
6774 // Get a number of shuffle operations per result.
6775 unsigned NumOfShufflesPerResult =
6776 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6777
6778 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6779 // When we have more than one destination, we need additional instructions
6780 // to keep sources.
6781 InstructionCost NumOfMoves = 0;
6782 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6783 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6784
6785 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6786 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6787 NumOfMoves;
6788
6789 return Cost;
6790 }
6791
6792 // Store.
6793 assert(Opcode == Instruction::Store &&
6794 "Expected Store Instruction at this point");
6795 // X86InterleavedAccess support only the following interleaved-access group.
6796 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6797 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6798 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6799 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6800
6801 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6802 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6803 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6804 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6805 };
6806
6807 if (const auto *Entry =
6808 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6809 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6810 // If an entry does not exist, fall back to the default implementation.
6811
6812 // There are no strided stores at the moment, and a store can't be folded
6813 // into a shuffle.
6814 unsigned NumOfSources = Factor; // The number of values to be merged.
6815 InstructionCost ShuffleCost =
6816 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
6817 CostKind, 0, nullptr);
6818 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6819
6820 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6821 // We need additional instructions to keep sources.
6822 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6823 InstructionCost Cost =
6824 MaskCost +
6825 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6826 NumOfMoves;
6827 return Cost;
6828}
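// Worked example (illustrative): a stride-3 store of v48i8 (VF = 16, so
// VT = v16i8) on an AVX-512 BWI target hits the {3, MVT::v16i8, 12} entry,
// so the returned cost is MaskCost + NumOfMemOps * MemOpCost + 12.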
6829
6830 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6831 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6832 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6833 bool UseMaskForCond, bool UseMaskForGaps) const {
6834 auto *VecTy = cast<FixedVectorType>(BaseTy);
6835
6836 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6837 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6838 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6839 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6840 return true;
6841 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6842 return ST->hasBWI();
6843 if (EltTy->isBFloatTy())
6844 return ST->hasBF16();
6845 return false;
6846 };
6847 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6848 return getInterleavedMemoryOpCostAVX512(
6849 Opcode, VecTy, Factor, Indices, Alignment,
6850 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6851
6852 if (UseMaskForCond || UseMaskForGaps)
6853 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6854 Alignment, AddressSpace, CostKind,
6855 UseMaskForCond, UseMaskForGaps);
6856
6857 // Get estimation for interleaved load/store operations for SSE-AVX2.
6858 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6859 // computing the cost using a generic formula as a function of generic
6860 // shuffles. We therefore use a lookup table instead, filled according to
6861 // the instruction sequences that codegen currently generates.
6862
6863 // VecTy for interleave memop is <VF*Factor x Elt>.
6864 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6865 // VecTy = <12 x i32>.
6866 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6867
6868 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6869 // the VF=2, while v2i128 is an unsupported MVT vector type
6870 // (see MachineValueType.h::getVectorVT()).
6871 if (!LegalVT.isVector())
6872 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6873 Alignment, AddressSpace, CostKind);
6874
6875 unsigned VF = VecTy->getNumElements() / Factor;
6876 Type *ScalarTy = VecTy->getElementType();
6877 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6878 if (!ScalarTy->isIntegerTy())
6879 ScalarTy =
6880 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6881
6882 // Get the cost of all the memory operations.
6883 // FIXME: discount dead loads.
6884 InstructionCost MemOpCosts =
6885 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
6886
6887 auto *VT = FixedVectorType::get(ScalarTy, VF);
6888 EVT ETy = TLI->getValueType(DL, VT);
6889 if (!ETy.isSimple())
6890 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6891 Alignment, AddressSpace, CostKind);
6892
6893 // TODO: Complete for other data-types and strides.
6894 // Each combination of Stride, element bit width and VF results in a different
6895 // sequence; The cost tables are therefore accessed with:
6896 // Factor (stride) and VectorType=VFxiN.
6897 // The Cost accounts only for the shuffle sequence;
6898 // The cost of the loads/stores is accounted for separately.
6899 //
6900 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6901 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6902 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6903 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6904 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6905 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6906
6907 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6908 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6909 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6910
6911 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6912 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6913 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6914
6915 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6916 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6917 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6918 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6919
6920 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6921 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6922 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6923 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6924 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6925
6926 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6927 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6928 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6929 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6930 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6931
6932 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6933 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6934 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6935 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6936 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6937
6938 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6939 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6940 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6941 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6942
6943 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6944 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6945 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6946 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6947 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6948
6949 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6950 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6951 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6952 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6953 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6954
6955 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6956 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6957 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6958 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6959 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6960
6961 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6962 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6963 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6964 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6965
6966 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6967 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6968 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6969 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6970 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6971
6972 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6973 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6974 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6975 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6976 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6977
6978 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6979 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6980 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6981 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6982
6983 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6984 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6985 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6986
6987 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6988 };
6989
6990 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6991 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6992 };
6993
6994 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6995 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6996 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6997
6998 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6999 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
7000
7001 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
7002 };
7003
7004 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
7005 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
7006 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
7007
7008 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
7009 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
7010 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
7011
7012 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
7013 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
7014 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
7015 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
7016
7017 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
7018 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
7019 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
7020 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
7021 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
7022
7023 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
7024 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
7025 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
7026 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
7027 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
7028
7029 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
7030 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
7031 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
7032 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
7033 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
7034
7035 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
7036 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
7037 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
7038 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
7039 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
7040
7041 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
7042 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
7043 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
7044 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
7045
7046 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7047 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7048 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7049 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7050 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7051
7052 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7053 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7054 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7055 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7056 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7057
7058 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7059 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7060 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7061 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7062 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7063
7064 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7065 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7066 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7067 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7068
7069 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7070 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7071 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7072 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7073 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7074
7075 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7076 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7077 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7078 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7079 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7080
7081 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7082 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7083 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7084 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7085
7086 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7087 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7088 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7089 };
7090
7091 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7092 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7093 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7094 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7095
7096 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7097 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7098
7099 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7100 };
7101
7102 if (Opcode == Instruction::Load) {
7103 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7104 MemOpCosts](const CostTblEntry *Entry) {
7105 // NOTE: this is just an approximation!
7106 // It can over/under-estimate the cost!
7107 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7108 };
7109
7110 if (ST->hasAVX2())
7111 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7112 ETy.getSimpleVT()))
7113 return GetDiscountedCost(Entry);
7114
7115 if (ST->hasSSSE3())
7116 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7117 ETy.getSimpleVT()))
7118 return GetDiscountedCost(Entry);
7119
7120 if (ST->hasSSE2())
7121 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7122 ETy.getSimpleVT()))
7123 return GetDiscountedCost(Entry);
7124 } else {
7125 assert(Opcode == Instruction::Store &&
7126 "Expected Store Instruction at this point");
7127 assert((!Indices.size() || Indices.size() == Factor) &&
7128 "Interleaved store only supports fully-interleaved groups.");
7129 if (ST->hasAVX2())
7130 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7131 ETy.getSimpleVT()))
7132 return MemOpCosts + Entry->Cost;
7133
7134 if (ST->hasSSE2())
7135 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7136 ETy.getSimpleVT()))
7137 return MemOpCosts + Entry->Cost;
7138 }
7139
7140 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7141 Alignment, AddressSpace, CostKind,
7142 UseMaskForCond, UseMaskForGaps);
7143}
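// Worked example (illustrative): a stride-2 load of v32i8 (VF = 16) on an
// AVX2 target hits the {2, MVT::v16i8, 4} entry above; with both group
// members used the discounted cost is MemOpCosts + divideCeil(2 * 4, 2),
// and with a single member it is MemOpCosts + divideCeil(1 * 4, 2).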
7144
7145 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
7146 StackOffset BaseOffset,
7147 bool HasBaseReg, int64_t Scale,
7148 unsigned AddrSpace) const {
7149 // Scaling factors are not free at all.
7150 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7151 // will take 2 allocations in the out of order engine instead of 1
7152 // for plain addressing mode, i.e. inst (reg1).
7153 // E.g.,
7154 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7155 // Requires two allocations (one for the load, one for the computation)
7156 // whereas:
7157 // vaddps (%rsi), %ymm0, %ymm1
7158 // Requires just 1 allocation, i.e., freeing allocations for other operations
7159 // and having less micro operations to execute.
7160 //
7161 // For some X86 architectures, this is even worse because for instance for
7162 // stores, the complex addressing mode forces the instruction to use the
7163 // "load" ports instead of the dedicated "store" port.
7164 // E.g., on Haswell:
7165 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7166 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7167 TargetLoweringBase::AddrMode AM;
7168 AM.BaseGV = BaseGV;
7169 AM.BaseOffs = BaseOffset.getFixed();
7170 AM.HasBaseReg = HasBaseReg;
7171 AM.Scale = Scale;
7172 AM.ScalableOffset = BaseOffset.getScalable();
7173 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7174 // Scale represents reg2 * scale, thus account for 1
7175 // as soon as we use a second register.
7176 return AM.Scale != 0;
7177 return InstructionCost::getInvalid();
7178}
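// For example, matching the comment above: vaddps (%rsi,%rdx), %ymm0, %ymm1
// has AM.Scale = 1 and returns a cost of 1, while vaddps (%rsi), %ymm0,
// %ymm1 has AM.Scale = 0 and is free.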
7179
7180 unsigned X86TTIImpl::getBranchMispredictPenalty() const {
7181 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7182 return 14;
7183}
7184
7185 bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7186 unsigned Bits = Ty->getScalarSizeInBits();
7187
7188 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7189 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7190 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7191 return false;
7192
7193 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7194 // shifts just as cheap as scalar ones.
7195 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7196 return false;
7197
7198 // AVX512BW has shifts such as vpsllvw.
7199 if (ST->hasBWI() && Bits == 16)
7200 return false;
7201
7202 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7203 // fully general vector.
7204 return true;
7205}
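// For example, under the checks above: on an AVX2-only target a uniform
// v8i16 shift returns true (splatting the scalar amount is cheaper), while
// a v8i32 shift returns false because vpsllvd makes the variable form just
// as cheap.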
7206
7207unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7208 Type *ScalarValTy) const {
7209 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7210 return 4;
7211 }
7212 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7213}
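// For example: with F16C, a store of half values reports a minimum VF of 4,
// presumably because the F16C conversions (vcvtph2ps/vcvtps2ph) operate on
// at least four half elements at a time.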
7214
7215 bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7216 SmallVectorImpl<Use *> &Ops) const {
7217 using namespace llvm::PatternMatch;
7218
7219 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7220 if (!VTy)
7221 return false;
7222
7223 if (I->getOpcode() == Instruction::Mul &&
7224 VTy->getElementType()->isIntegerTy(64)) {
7225 for (auto &Op : I->operands()) {
7226 // Make sure we are not already sinking this operand
7227 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7228 continue;
7229
7230 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7231 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7232 if (ST->hasSSE41() &&
7233 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7234 m_SpecificInt(32)))) {
7235 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7236 Ops.push_back(&Op);
7237 } else if (ST->hasSSE2() &&
7238 match(Op.get(),
7239 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7240 Ops.push_back(&Op);
7241 }
7242 }
7243
7244 return !Ops.empty();
7245 }
7246
7247 // A uniform shift amount in a vector shift or funnel shift may be much
7248 // cheaper than a generic variable vector shift, so make that pattern visible
7249 // to SDAG by sinking the shuffle instruction next to the shift.
7250 int ShiftAmountOpNum = -1;
7251 if (I->isShift())
7252 ShiftAmountOpNum = 1;
7253 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7254 if (II->getIntrinsicID() == Intrinsic::fshl ||
7255 II->getIntrinsicID() == Intrinsic::fshr)
7256 ShiftAmountOpNum = 2;
7257 }
7258
7259 if (ShiftAmountOpNum == -1)
7260 return false;
7261
7262 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7263 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7264 isVectorShiftByScalarCheap(I->getType())) {
7265 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7266 return true;
7267 }
7268
7269 return false;
7270}
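// Illustrative IR for the PMULDQ pattern matched above (SSE4.1 path):
// %a = shl <2 x i64> %x, <i64 32, i64 32>
// %s = ashr <2 x i64> %a, <i64 32, i64 32>
// %m = mul <2 x i64> %s, %y
// Sinking the shl/ashr next to the mul lets SDAG select pmuldq instead of a
// full 64-bit multiply.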
7271
7273 bool HasEGPR = ST->hasEGPR();
7274 const TargetMachine &TM = getTLI()->getTargetMachine();
7275
7276 for (User *U : F.users()) {
7277 auto *CB = dyn_cast<CallBase>(U);
7278 if (!CB || CB->getCalledOperand() != &F)
7279 continue;
7280 Function *CallerFunc = CB->getFunction();
7281 if (TM.getSubtarget<X86Subtarget>(*CallerFunc).hasEGPR() != HasEGPR)
7282 return false;
7283 }
7284
7285 return true;
7286}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
CostTblEntryT< CostKindCosts > CostKindTblEntry
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
TypeConversionCostTblEntryT< CostKindCosts > TypeConversionCostKindTblEntry
This file a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1679
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1339
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1497
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1052
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1571
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Value * getCalledOperand() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full range of operator support required for arithmetic and comparisons.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
const FeatureBitset & getFeatureBits() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
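MVT is the currency of the cost tables in this file: IR types are legalized to an MVT and then looked up. A few representative queries, as a standalone sketch (the header path matches recent LLVM trees; the function name is illustrative):

  #include "llvm/CodeGenTypes/MachineValueType.h"
  using namespace llvm;

  void mvtQueries() {
    MVT VT = MVT::getVectorVT(MVT::f32, 4);       // v4f32
    bool Is128 = VT.is128BitVector();             // true: 4 lanes x 32 bits
    unsigned NumElts = VT.getVectorNumElements(); // 4
    MVT EltVT = VT.getScalarType();               // f32
    (void)Is128; (void)NumElts; (void)EltVT;
  }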
Information for memory intrinsic cost model.
Class to represent pointers.
unsigned getAddressSpace() const
Return the address space of the Pointer type.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInfo-derived member variable.
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of an instruction.
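The same operation can be costed under any of these kinds. A sketch, assuming the caller provides the TTI handle and a legal type (the function name is illustrative):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  void costFDiv(const TargetTransformInfo &TTI, Type *Ty) {
    // An expensive opcode such as fdiv typically reports very different
    // numbers for reciprocal throughput vs. latency.
    InstructionCost Thru = TTI.getArithmeticInstrCost(
        Instruction::FDiv, Ty, TargetTransformInfo::TCK_RecipThroughput);
    InstructionCost Lat = TTI.getArithmeticInstrCost(
        Instruction::FDiv, Ty, TargetTransformInfo::TCK_Latency);
    (void)Thru; (void)Lat;
  }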
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
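Each kind classifies a shuffle mask so the target can pick a matching cost. A sketch of costing a broadcast, using the getShuffleCost signature listed above (the concrete type and mask are illustrative):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  InstructionCost broadcastCost(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
    auto *VecTy = FixedVectorType::get(Type::getInt16Ty(Ctx), 8); // <8 x i16>
    SmallVector<int, 8> Mask(8, 0); // every lane reads element 0
    return TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy,
                              Mask, TargetTransformInfo::TCK_RecipThroughput,
                              /*Index=*/0, /*SubTp=*/nullptr);
  }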
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitWidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:285
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:381
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
Base class of all SIMD vector types.
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements in the input type.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same element type.
Type * getElementType() const
bool useAVX512Regs() const
bool hasAVX512() const
bool hasAVX2() const
bool useFastCCForInternalCall(Function &F) const override
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target, for a load/store of the specified type.
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool prefersVectorizedAddressing() const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Type) const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
Calculate the cost of a Gather / Scatter operation.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3020
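ScaleBitMask is declared in the APIntOps namespace; widening splats each bit across the new lanes, narrowing merges neighbors. A worked example:

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  void scaleMask() {
    APInt Narrow(4, 0b1010);
    // Widen 4 -> 8 bits: each source bit is splatted across two result bits,
    // so 0b1010 becomes 0b11001100.
    APInt Wide = APIntOps::ScaleBitMask(Narrow, 8);
    (void)Wide;
  }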
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types and value types.
Definition ISDOpcodes.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition ISDOpcodes.h:576
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definitions.
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W).
Definition ISDOpcodes.h:365
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
ap_match< APInt > m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
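These matchers compose into declarative checks over the IR. A self-contained sketch recognizing a single-use shift (the constant 3 and the helper name are arbitrary illustrations):

  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // True if V is (X << 3) and the shift has exactly one user.
  bool isOneUseShlBy3(Value *V) {
    Value *X;
    return match(V, m_OneUse(m_Shl(m_Value(X), m_SpecificInt(3))));
  }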
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms like all_of.
Definition STLExtras.h:2170
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
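This is the idiom used throughout this file: a static table of (ISD opcode, MVT, cost) entries probed with CostTableLookup. A sketch with made-up costs:

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  using namespace llvm;

  static const CostTblEntry ExampleCostTbl[] = {
    { ISD::ADD, MVT::v4i32, 1 }, // illustrative numbers only
    { ISD::MUL, MVT::v4i32, 2 },
  };

  unsigned exampleLookup() {
    if (const auto *Entry = CostTableLookup(ExampleCostTbl, ISD::MUL, MVT::v4i32))
      return Entry->Cost;
    return 0; // no entry: caller falls back to a default cost
  }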
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence and B, C, ... are the values from the original input ranges.
Definition STLExtras.h:2544
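enumerate pairs each element with its index, avoiding manual counters. A small sketch over a shuffle mask (helper name is illustrative):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  using namespace llvm;

  // True if any lane of the mask reads its own index (an identity lane).
  bool hasIdentityLane(ArrayRef<int> Mask) {
    for (auto [Idx, M] : enumerate(Mask))
      if (M == int(Idx))
        return true;
    return false;
  }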
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
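A typical use is proving alignment facts about an address or index. A sketch using the KnownBits-returning overload (the helper name is illustrative):

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // Minimum number of trailing zero bits provable for V.
  unsigned provenTrailingZeros(const Value *V, const DataLayout &DL) {
    KnownBits Known = computeKnownBits(V, DL);
    return Known.countMinTrailingZeros();
  }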
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
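These integer helpers appear all over the legalization arithmetic. Worked values:

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  using namespace llvm;

  void integerHelpers() {
    uint64_t A = divideCeil(10, 4);     // 3: ceil(10 / 4)
    uint64_t B = alignTo(10, Align(8)); // 16: next multiple of 8
    uint64_t C = alignDown(10, 8);      // 8: previous multiple of 8
    (void)A; (void)B; (void)C;
  }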
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1883
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
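The three callbacks receive, respectively: destination registers needing no input, single-input per-register masks, and two-input per-register masks. A sketch that sums an illustrative per-register cost (the 1/2 weights and register counts are made up):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/Support/InstructionCost.h"
  using namespace llvm;

  InstructionCost perRegShuffleCost(ArrayRef<int> Mask) {
    InstructionCost Cost = 0;
    processShuffleMasks(
        Mask, /*NumOfSrcRegs=*/2, /*NumOfDestRegs=*/2, /*NumOfUsedRegs=*/2,
        []() { /* this destination register needs no input */ },
        [&](ArrayRef<int>, unsigned, unsigned) { Cost += 1; },       // one source
        [&](ArrayRef<int>, unsigned, unsigned, bool) { Cost += 2; }); // two sources
    return Cost;
  }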
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
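The conversion variant keys on both destination and source MVTs. A sketch mirroring the CostTableLookup example above (costs are made up):

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  using namespace llvm;

  static const TypeConversionCostTblEntry ExampleConvTbl[] = {
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, // illustrative only
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
  };

  unsigned exampleConvLookup() {
    if (const auto *Entry = ConvertCostTableLookup(
            ExampleConvTbl, ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16))
      return Entry->Cost;
    return 0;
  }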
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
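Undefined lanes (negative mask elements) are ignored, so a splat can still be detected through partially-undef masks. Worked values:

  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  void splatIndexExamples() {
    int A = getSplatIndex({2, 2, 2, 2});  // 2: every lane reads element 2
    int B = getSplatIndex({2, -1, 2, 2}); // 2: the undef lane is ignored
    int C = getSplatIndex({0, 1, 2, 3});  // -1: not a splat
    (void)A; (void)B; (void)C;
  }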
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Cost Table Entry.
Definition CostTable.h:25
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:255
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition CostTable.h:55