LLVM 23.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About Cost Model numbers used below it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU instead of usage of a
17/// specific CPU model. Usually the numbers correspond to the CPU where the
18/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost,
21/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target dependent costs (latency)
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported
46/// by the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48/// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
50
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71// Helper struct to store/access costs for each cost kind.
72// TODO: Move this to allow other targets to use it?
// Each member holds the cost for one TTI::TargetCostKind; the sentinel ~0U
// means "no cost recorded for this kind".
// NOTE(review): the struct header line (original line 73) appears to have
// been dropped by the page extraction -- verify against upstream.
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
// Accessor: returns the cost for the requested cost kind, or std::nullopt
// when that kind's cost was left at the ~0U sentinel.
79 std::optional<unsigned>
// NOTE(review): the accessor's signature line (original line 80) and the
// 'case ...: Cost = ...;' lines (original lines 83-94) appear to have been
// dropped by the page extraction -- only the 'break's remain. Verify
// against upstream before relying on this listing.
81 unsigned Cost = ~0U;
82 switch (Kind) {
85 break;
88 break;
91 break;
94 break;
95 }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
103
// Reports whether population-count on a TyWidth-bit integer is fast in
// hardware: fast when the POPCNT feature is available, otherwise a software
// expansion is assumed.
// NOTE(review): the return-type line (original line 104) appears to have
// been dropped by the page extraction -- verify against upstream.
105X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
106 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
107 // TODO: Currently the __builtin_popcount() implementation using SSE3
108 // instructions is inefficient. Once the problem is fixed, we should
109 // call ST->hasSSE3() instead of ST->hasPOPCNT().
110 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
111}
112
// Returns the per-level data-cache size for a "generic" modern x86 CPU:
// 32 KiB for L1D and 256 KiB for L2 (values shared by the listed cores).
113std::optional<unsigned> X86TTIImpl::getCacheSize(
// NOTE(review): the parameter line and the 'case' labels for the cache
// levels (original lines 114, 116, 127) appear to have been dropped by the
// page extraction -- verify against upstream.
115 switch (Level) {
117 // - Penryn
118 // - Nehalem
119 // - Westmere
120 // - Sandy Bridge
121 // - Ivy Bridge
122 // - Haswell
123 // - Broadwell
124 // - Skylake
125 // - Kabylake
126 return 32 * 1024; // 32 KiB
128 // - Penryn
129 // - Nehalem
130 // - Westmere
131 // - Sandy Bridge
132 // - Ivy Bridge
133 // - Haswell
134 // - Broadwell
135 // - Skylake
136 // - Kabylake
137 return 256 * 1024; // 256 KiB
138 }
139
140 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
141}
142
// Returns the cache associativity (8-way for both visible levels) for the
// same set of "generic" x86 cores listed below.
143std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
145 // - Penryn
146 // - Nehalem
147 // - Westmere
148 // - Sandy Bridge
149 // - Ivy Bridge
150 // - Haswell
151 // - Broadwell
152 // - Skylake
153 // - Kabylake
154 switch (Level) {
// NOTE(review): the parameter line and the 'case' labels (original lines
// 144, 155, 157) appear to have been dropped by the page extraction -- the
// [[fallthrough]] suggests both cache levels share the 'return 8' below.
156 [[fallthrough]];
158 return 8;
159 }
160
161 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
162}
163
165
// Maps a type to a register-class id: vector types to VectorClass, scalar
// floating-point types to ScalarFPClass, everything else to GPRClass.
// NOTE(review): the function signature (original line 166, presumably
// taking a 'bool Vector' and a nullable 'Type *Ty') appears to have been
// dropped by the page extraction -- verify against upstream.
167 return Vector ? VectorClass
168 : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
169 : GPRClass;
170}
171
172unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
173 if (ClassID == VectorClass && !ST->hasSSE1())
174 return 0;
175
176 if (!ST->is64Bit())
177 return 8;
178
179 if ((ClassID == GPRClass && ST->hasEGPR()) ||
180 (ClassID != GPRClass && ST->hasAVX512()))
181 return 32;
182
183 return 16;
184}
185
// Queries whether a conditional (faulting-suppressed) load/store is
// available: requires the CF (CFCMOV) feature, and the element type must be
// a 16/32/64-bit integer (scalar, or a single-element fixed vector).
// NOTE(review): the function signature (original line 186) appears to have
// been dropped by the page extraction -- verify against upstream.
187 if (!ST->hasCF())
188 return false;
// A null type only asks about general feature availability.
189 if (!Ty)
190 return true;
191 // Conditional faulting is supported by CFCMOV, which only accepts
192 // 16/32/64-bit operands.
193 // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
194 // profitable.
// Accept plain integers, or <1 x iN> fixed vectors; reject anything else.
195 auto *VTy = dyn_cast<FixedVectorType>(Ty);
196 if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
197 return false;
198 auto *ScalarTy = Ty->getScalarType();
199 switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
200 default:
201 return false;
202 case 16:
203 case 32:
204 case 64:
205 return true;
206 }
207}
208
// Returns the register width for the requested kind: 64/32 bits for scalar
// GPRs depending on mode; for fixed vectors, the widest available of
// 512/256/128 bits capped by the subtarget's preferred vector width; and a
// zero scalable width (x86 has no scalable vectors).
// NOTE(review): the signature (original lines 209-210) and the 'case'
// labels for each register kind (original lines 213, 215, 223) appear to
// have been dropped by the page extraction -- verify against upstream.
211 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
212 switch (K) {
214 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
216 if (ST->hasAVX512() && PreferVectorWidth >= 512)
217 return TypeSize::getFixed(512);
218 if (ST->hasAVX() && PreferVectorWidth >= 256)
219 return TypeSize::getFixed(256);
220 if (ST->hasSSE1() && PreferVectorWidth >= 128)
221 return TypeSize::getFixed(128);
222 return TypeSize::getFixed(0);
224 return TypeSize::getScalable(0);
225 }
226
227 llvm_unreachable("Unsupported register kind");
228}
229
234
// Returns the maximum interleave factor for the vectorizer: 1 when the loop
// stays scalar or on Atom-class cores, 4 with AVX (more ports/pipelined
// vector units), otherwise 2.
// NOTE(review): the function signature (original line 235, presumably
// taking an ElementCount VF) appears to have been dropped by the page
// extraction -- verify against upstream.
236 // If the loop will not be vectorized, don't interleave the loop.
237 // Let regular unroll to unroll the loop, which saves the overflow
238 // check and memory check cost.
239 if (VF.isScalar())
240 return 1;
241
242 if (ST->isAtom())
243 return 1;
244
245 // Sandybridge and Haswell have multiple execution ports and pipelined
246 // vector units.
247 if (ST->hasAVX())
248 return 4;
249
250 return 2;
251}
252
254 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
256 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
257
258 // vXi8 multiplications are always promoted to vXi16.
259 // Sub-128-bit types can be extended/packed more efficiently.
260 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
261 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
262 Type *WideVecTy =
264 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
266 CostKind) +
267 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
269 CostKind) +
270 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
271 }
272
273 // Legalize the type.
274 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
275
276 int ISD = TLI->InstructionOpcodeToISD(Opcode);
277 assert(ISD && "Invalid opcode");
278
279 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
280 (LT.second.getScalarType() == MVT::i32 ||
281 LT.second.getScalarType() == MVT::i64)) {
282 // Check if the operands can be represented as a smaller datatype.
283 bool Op1Signed = false, Op2Signed = false;
284 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
285 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
286 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
287 bool SignedMode = Op1Signed || Op2Signed;
288
289 // If both vXi32 are representable as i15 and at least one is constant,
290 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
291 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
292 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
293 LT.second.getScalarType() == MVT::i32) {
294 bool Op1Constant =
295 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
296 bool Op2Constant =
297 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
298 bool Op1Sext = isa<SExtInst>(Args[0]) &&
299 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
300 bool Op2Sext = isa<SExtInst>(Args[1]) &&
301 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
302
303 bool IsZeroExtended = !Op1Signed || !Op2Signed;
304 bool IsConstant = Op1Constant || Op2Constant;
305 bool IsSext = Op1Sext || Op2Sext;
306 if (IsConstant || IsZeroExtended || IsSext)
307 LT.second =
308 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
309 }
310
311 // Check if the vXi32 operands can be shrunk into a smaller datatype.
312 // This should match the codegen from reduceVMULWidth.
313 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
314 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
315 if (OpMinSize <= 7)
316 return LT.first * 3; // pmullw/sext
317 if (!SignedMode && OpMinSize <= 8)
318 return LT.first * 3; // pmullw/zext
319 if (OpMinSize <= 15)
320 return LT.first * 5; // pmullw/pmulhw/pshuf
321 if (!SignedMode && OpMinSize <= 16)
322 return LT.first * 5; // pmullw/pmulhw/pshuf
323 }
324
325 // If both vXi64 are representable as (unsigned) i32, then we can perform
326 // the multiple with a single PMULUDQ instruction.
327 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
328 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
329 ISD = X86ISD::PMULUDQ;
330 }
331
332 // Vector multiply by pow2 will be simplified to shifts.
333 // Vector multiply by -pow2 will be simplified to shifts/negates.
334 if (ISD == ISD::MUL && Op2Info.isConstant() &&
335 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
337 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
338 Op1Info.getNoProps(), Op2Info.getNoProps());
339 if (Op2Info.isNegatedPowerOf2())
340 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
341 return Cost;
342 }
343
344 // On X86, vector signed division by constants power-of-two are
345 // normally expanded to the sequence SRA + SRL + ADD + SRA.
346 // The OperandValue properties may not be the same as that of the previous
347 // operation; conservatively assume OP_None.
348 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
349 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
351 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
352 Op1Info.getNoProps(), Op2Info.getNoProps());
353 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
354 Op1Info.getNoProps(), Op2Info.getNoProps());
355 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
356 Op1Info.getNoProps(), Op2Info.getNoProps());
357
358 if (ISD == ISD::SREM) {
359 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
360 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
361 Op2Info.getNoProps());
362 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
363 Op2Info.getNoProps());
364 }
365
366 return Cost;
367 }
368
369 // Vector unsigned division/remainder will be simplified to shifts/masks.
370 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
371 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
372 if (ISD == ISD::UDIV)
373 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
374 Op1Info.getNoProps(), Op2Info.getNoProps());
375 // UREM
376 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
377 Op1Info.getNoProps(), Op2Info.getNoProps());
378 }
379
380 static const CostKindTblEntry GFNIUniformConstCostTable[] = {
381 { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
382 { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
383 { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
384 { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
385 { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
386 { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
387 { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
388 { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
389 { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
390 };
391
392 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
393 if (const auto *Entry =
394 CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
395 if (auto KindCost = Entry->Cost[CostKind])
396 return LT.first * *KindCost;
397
398 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
399 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
400 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
401 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
402 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
403 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
404 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
405 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
406 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
407 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
408
409 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
410 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
411 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
412 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
413 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
414 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
415 };
416
417 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
418 if (const auto *Entry =
419 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
420 if (auto KindCost = Entry->Cost[CostKind])
421 return LT.first * *KindCost;
422
423 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
424 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
425 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
426 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
427
428 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
429 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
430 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
431
432 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
433 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
434 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
435 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
436 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
437 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
438
439 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
440 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
441 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
442 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
443 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
444 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
445 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
446
447 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
448 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
449 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
450 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
451 };
452
453 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
454 if (const auto *Entry =
455 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
456 if (auto KindCost = Entry->Cost[CostKind])
457 return LT.first * *KindCost;
458
459 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
460 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
461 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
462 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
463 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
464 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
465 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
466
467 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
468 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
469 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
470 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
471 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
472 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
473
474 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
475 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
476 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
477 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
478 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
479 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
480
481 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
482 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
483 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
484 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
485 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
486 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
487
488 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
489 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
490 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
491 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
492 };
493
494 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
495 if (const auto *Entry =
496 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
497 if (auto KindCost = Entry->Cost[CostKind])
498 return LT.first * *KindCost;
499
500 static const CostKindTblEntry AVXUniformConstCostTable[] = {
501 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
502 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
503 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
504 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
505 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
506 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
507
508 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
509 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
510 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
511 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
512 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
513 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
514
515 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
516 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
517 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
518 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
519 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
520 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
521
522 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
523 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
524 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
525 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
526 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
527 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
528
529 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
530 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
531 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
532 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
533 };
534
535 // XOP has faster vXi8 shifts.
536 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
537 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
538 if (const auto *Entry =
539 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
540 if (auto KindCost = Entry->Cost[CostKind])
541 return LT.first * *KindCost;
542
543 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
544 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
545 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
546 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
547
548 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
549 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
550 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
551
552 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
553 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
554 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
555
556 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
557 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
558 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
559
560 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
561 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
562 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
563 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
564 };
565
566 // XOP has faster vXi8 shifts.
567 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
568 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
569 if (const auto *Entry =
570 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
571 if (auto KindCost = Entry->Cost[CostKind])
572 return LT.first * *KindCost;
573
574 static const CostKindTblEntry AVX512BWConstCostTable[] = {
575 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
576 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
577 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
578 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
579
580 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
581 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
582 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
583 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
584 };
585
586 if (Op2Info.isConstant() && ST->hasBWI())
587 if (const auto *Entry =
588 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
589 if (auto KindCost = Entry->Cost[CostKind])
590 return LT.first * *KindCost;
591
592 static const CostKindTblEntry AVX512ConstCostTable[] = {
593 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
594 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
595 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
596 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
597
598 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
599 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
600 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
601 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
602
603 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
604 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
605 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
606 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
607 };
608
609 if (Op2Info.isConstant() && ST->hasAVX512())
610 if (const auto *Entry =
611 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
612 if (auto KindCost = Entry->Cost[CostKind])
613 return LT.first * *KindCost;
614
615 static const CostKindTblEntry AVX2ConstCostTable[] = {
616 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
617 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
618 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
619 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
620
621 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
622 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
624 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
627 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
628 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
629 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
630 };
631
632 if (Op2Info.isConstant() && ST->hasAVX2())
633 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
634 if (auto KindCost = Entry->Cost[CostKind])
635 return LT.first * *KindCost;
636
637 static const CostKindTblEntry AVXConstCostTable[] = {
638 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
639 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
640 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
641 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
642
643 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
644 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
645 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
646 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
647
648 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
649 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
650 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
651 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
652 };
653
654 if (Op2Info.isConstant() && ST->hasAVX())
655 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
656 if (auto KindCost = Entry->Cost[CostKind])
657 return LT.first * *KindCost;
658
659 static const CostKindTblEntry SSE41ConstCostTable[] = {
660 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
661 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
662 };
663
664 if (Op2Info.isConstant() && ST->hasSSE41())
665 if (const auto *Entry =
666 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
667 if (auto KindCost = Entry->Cost[CostKind])
668 return LT.first * *KindCost;
669
670 static const CostKindTblEntry SSE2ConstCostTable[] = {
671 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
672 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
673 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
674 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
675
676 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
677 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
678 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
679 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
680
681 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
682 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
683 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
684 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
685 };
686
687 if (Op2Info.isConstant() && ST->hasSSE2())
688 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
689 if (auto KindCost = Entry->Cost[CostKind])
690 return LT.first * *KindCost;
691
692 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
693 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
694 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
696 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
697 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
698 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
699 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
700 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
701 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
702
703 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
704 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
705 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
706 };
707
708 if (ST->hasBWI() && Op2Info.isUniform())
709 if (const auto *Entry =
710 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
711 if (auto KindCost = Entry->Cost[CostKind])
712 return LT.first * *KindCost;
713
714 static const CostKindTblEntry AVX512UniformCostTable[] = {
715 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
716 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
717 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
718
719 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
720 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
721 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
722
723 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
724 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
725 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
726 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
727 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
728 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
729 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
730 };
731
732 if (ST->hasAVX512() && Op2Info.isUniform())
733 if (const auto *Entry =
734 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
735 if (auto KindCost = Entry->Cost[CostKind])
736 return LT.first * *KindCost;
737
738 static const CostKindTblEntry AVX2UniformCostTable[] = {
739 // Uniform splats are cheaper for the following instructions.
740 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
741 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
742 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
743 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
744 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
745 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
746
747 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
748 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
749 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
750 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
751 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
752 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
753
754 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
755 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
756 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
757 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
758 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
759 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
760
761 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
762 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
763 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
764 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
765 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
766 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
767 };
768
769 if (ST->hasAVX2() && Op2Info.isUniform())
770 if (const auto *Entry =
771 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
772 if (auto KindCost = Entry->Cost[CostKind])
773 return LT.first * *KindCost;
774
  // Shift costs for uniform (splat) shift amounts on AVX1.
  // Per-entry costs are indexed by TargetCostKind (order follows the
  // TargetCostKind enum: recip-throughput, latency, code-size, size+latency).
  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  {  4,  4, 6, 8 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  {  4,  8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  {  6,  6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  {  7,  8,11,14 } }, // psllw + pand + split.
    { ISD::SRL,  MVT::v32i8,  {  7,  9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA,  MVT::v32i8,  { 10, 11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL,  MVT::v8i16,  {  1,  3, 1, 2 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  {  1,  3, 1, 2 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  {  1,  3, 1, 2 } }, // psraw.
    { ISD::SHL,  MVT::v16i16, {  3,  7, 5, 7 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, {  3,  7, 5, 7 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, {  3,  7, 5, 7 } }, // psraw + split.

    { ISD::SHL,  MVT::v4i32,  {  1,  3, 1, 2 } }, // pslld.
    { ISD::SRL,  MVT::v4i32,  {  1,  3, 1, 2 } }, // psrld.
    { ISD::SRA,  MVT::v4i32,  {  1,  3, 1, 2 } }, // psrad.
    { ISD::SHL,  MVT::v8i32,  {  3,  7, 5, 7 } }, // pslld + split.
    { ISD::SRL,  MVT::v8i32,  {  3,  7, 5, 7 } }, // psrld + split.
    { ISD::SRA,  MVT::v8i32,  {  3,  7, 5, 7 } }, // psrad + split.

    { ISD::SHL,  MVT::v2i64,  {  1,  3, 1, 2 } }, // psllq.
    { ISD::SRL,  MVT::v2i64,  {  1,  3, 1, 2 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64,  {  3,  4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  {  3,  7, 4, 6 } }, // psllq + split.
    { ISD::SRL,  MVT::v4i64,  {  3,  7, 4, 6 } }, // psrlq + split.
    { ISD::SRA,  MVT::v4i64,  {  6,  7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
812
  // Shift costs for uniform (splat) shift amounts on SSE2.
  // Per-entry costs are indexed by TargetCostKind (order follows the
  // TargetCostKind enum: recip-throughput, latency, code-size, size+latency).
  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i8,  {  9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  {  9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL,  MVT::v8i16,  {  2,  2, 1, 2 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  {  2,  2, 1, 2 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  {  2,  2, 1, 2 } }, // psraw.

    { ISD::SHL,  MVT::v4i32,  {  2,  2, 1, 2 } }, // pslld
    { ISD::SRL,  MVT::v4i32,  {  2,  2, 1, 2 } }, // psrld.
    { ISD::SRA,  MVT::v4i32,  {  2,  2, 1, 2 } }, // psrad.

    { ISD::SHL,  MVT::v2i64,  {  2,  2, 1, 2 } }, // psllq.
    { ISD::SRL,  MVT::v2i64,  {  2,  2, 1, 2 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64,  {  5,  9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  // As above, XOP has faster vXi8 shifts, so skip 8-bit element types there.
  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
838
  // AVX512DQ adds a native 64-bit vector multiply (vpmullq).
  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, {  2, 15, 1, 3 } }, // pmullq
    { ISD::MUL,  MVT::v4i64, {  2, 15, 1, 3 } }, // pmullq
    { ISD::MUL,  MVT::v8i64, {  3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
850
  // AVX512BW: 8/16-bit element ops, including the variable 16-bit shifts
  // (vpsllvw/vpsrlvw/vpsravw) that make vXi16 shifts single-instruction.
  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,   MVT::v16i8,  {  4,  8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,   MVT::v16i8,  {  4,  8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,   MVT::v16i8,  {  4,  8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL,   MVT::v32i8,  {  4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,   MVT::v32i8,  {  4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,   MVT::v32i8,  {  6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL,   MVT::v64i8,  {  6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,   MVT::v64i8,  {  7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,   MVT::v64i8,  { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL,   MVT::v8i16,  {  1,  1, 1, 1 } }, // vpsllvw
    { ISD::SRL,   MVT::v8i16,  {  1,  1, 1, 1 } }, // vpsrlvw
    { ISD::SRA,   MVT::v8i16,  {  1,  1, 1, 1 } }, // vpsravw
    { ISD::SHL,   MVT::v16i16, {  1,  1, 1, 1 } }, // vpsllvw
    { ISD::SRL,   MVT::v16i16, {  1,  1, 1, 1 } }, // vpsrlvw
    { ISD::SRA,   MVT::v16i16, {  1,  1, 1, 1 } }, // vpsravw
    { ISD::SHL,   MVT::v32i16, {  1,  1, 1, 1 } }, // vpsllvw
    { ISD::SRL,   MVT::v32i16, {  1,  1, 1, 1 } }, // vpsrlvw
    { ISD::SRA,   MVT::v32i16, {  1,  1, 1, 1 } }, // vpsravw

    { ISD::ADD,   MVT::v64i8,  {  1,  1, 1, 1 } }, // paddb
    { ISD::ADD,   MVT::v32i16, {  1,  1, 1, 1 } }, // paddw

    { ISD::ADD,   MVT::v32i8,  {  1,  1, 1, 1 } }, // paddb
    { ISD::ADD,   MVT::v16i16, {  1,  1, 1, 1 } }, // paddw
    { ISD::ADD,   MVT::v8i32,  {  1,  1, 1, 1 } }, // paddd
    { ISD::ADD,   MVT::v4i64,  {  1,  1, 1, 1 } }, // paddq

    { ISD::SUB,   MVT::v64i8,  {  1,  1, 1, 1 } }, // psubb
    { ISD::SUB,   MVT::v32i16, {  1,  1, 1, 1 } }, // psubw

    { ISD::MUL,   MVT::v16i8,  {  4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL,   MVT::v32i8,  {  3, 10, 7,10 } }, // pmaddubsw
    { ISD::MUL,   MVT::v64i8,  {  3, 11, 7,10 } }, // pmaddubsw
    { ISD::MUL,   MVT::v32i16, {  1,  5, 1, 1 } }, // pmullw

    { ISD::SUB,   MVT::v32i8,  {  1,  1, 1, 1 } }, // psubb
    { ISD::SUB,   MVT::v16i16, {  1,  1, 1, 1 } }, // psubw
    { ISD::SUB,   MVT::v8i32,  {  1,  1, 1, 1 } }, // psubd
    { ISD::SUB,   MVT::v4i64,  {  1,  1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
899
  // Base AVX512 (AVX512F) costs. Without BWI, 8/16-bit element shifts need
  // expensive blend/extend sequences; 32/64-bit shifts are single ops.
  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL,     MVT::v64i8,  { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL,     MVT::v64i8,  { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA,     MVT::v64i8,  { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL,     MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL,     MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA,     MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL,     MVT::v4i32,  {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v4i32,  {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v4i32,  {  1,  1, 1, 1 } },
    { ISD::SHL,     MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::SHL,     MVT::v16i32, {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v16i32, {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v16i32, {  1,  1, 1, 1 } },

    { ISD::SHL,     MVT::v2i64,  {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v2i64,  {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v2i64,  {  1,  1, 1, 1 } },
    { ISD::SHL,     MVT::v4i64,  {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v4i64,  {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v4i64,  {  1,  1, 1, 1 } },
    { ISD::SHL,     MVT::v8i64,  {  1,  1, 1, 1 } },
    { ISD::SRL,     MVT::v8i64,  {  1,  1, 1, 1 } },
    { ISD::SRA,     MVT::v8i64,  {  1,  1, 1, 1 } },

    { ISD::ADD,     MVT::v64i8,  {  3,  7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD,     MVT::v32i16, {  3,  7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB,     MVT::v64i8,  {  3,  7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB,     MVT::v32i16, {  3,  7, 5, 5 } }, // 2*psubw + split

    { ISD::AND,     MVT::v32i8,  {  1,  1, 1, 1 } },
    { ISD::AND,     MVT::v16i16, {  1,  1, 1, 1 } },
    { ISD::AND,     MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::AND,     MVT::v4i64,  {  1,  1, 1, 1 } },

    { ISD::OR,      MVT::v32i8,  {  1,  1, 1, 1 } },
    { ISD::OR,      MVT::v16i16, {  1,  1, 1, 1 } },
    { ISD::OR,      MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::OR,      MVT::v4i64,  {  1,  1, 1, 1 } },

    { ISD::XOR,     MVT::v32i8,  {  1,  1, 1, 1 } },
    { ISD::XOR,     MVT::v16i16, {  1,  1, 1, 1 } },
    { ISD::XOR,     MVT::v8i32,  {  1,  1, 1, 1 } },
    { ISD::XOR,     MVT::v4i64,  {  1,  1, 1, 1 } },

    { ISD::MUL,     MVT::v16i32, {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i32,  {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v4i32,  {  1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i64,  {  6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,     MVT::i64,    {  1 } },           // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1,  5, 1, 1 } },

    { ISD::FNEG,    MVT::v8f64,  {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v2f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::f64,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV,    MVT::f64,    {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v2f64,  {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f64,  {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f64,  { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG,    MVT::v16f32, {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v4f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::f32,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV,    MVT::f32,    {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f32,  {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f32,  {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
993
  // Variable per-lane shifts introduced by AVX2 (vpsllv*/vpsrlv*/vpsrav*).
  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
    // customize them to detect the cases where shift amount is a scalar one.
    { ISD::SHL,     MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,     MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,     MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,     MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,     MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,     MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,     MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,     MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL,     MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,     MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }
1030
  // XOP variable shift costs (vpshl*/vpsha* handle all element widths).
  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,     MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL,     MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA,     MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL,     MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL,     MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA,     MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL,     MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL,     MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA,     MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL,     MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL,     MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA,     MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,     MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL,     MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA,     MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL,     MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL,     MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA,     MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL,     MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL,     MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA,     MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL,     MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL,     MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA,     MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }
1072
  // Rewrite ISD (no early return) so the MUL rows of the tables below are
  // used: a left shift by a non-uniform constant vector is lowered as a
  // multiply by (1 << amt) per lane when a native vector multiply exists.
  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
1081
  // Goldmont-specific divide/sqrt costs.
  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV,  MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV,  MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV,  MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV,  MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1093
  // Silvermont-specific arithmetic costs.
  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,   MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,   MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
    { ISD::FMUL,  MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
    { ISD::FMUL,  MVT::f32,   {  1,  4, 1, 1 } }, // mulss
    { ISD::FMUL,  MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
    { ISD::FMUL,  MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
    { ISD::FDIV,  MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV,  MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV,  MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV,  MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD,  MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
    { ISD::FSUB,  MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 3X4 (addq throughput) = 17
    { ISD::MUL,   MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq\subq throughput is 4
    { ISD::ADD,   MVT::v2i64, {  4,  2, 1, 2 } },
    { ISD::SUB,   MVT::v2i64, {  4,  2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1122
  // General AVX2 costs (non-uniform shifts, integer/fp arithmetic).
  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,   {  6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,   {  6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,   {  5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,   {  6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,   {  8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,   { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,   { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,   {  5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16,  {  8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   {  4,  5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   {  8,  8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   {  1,  1, 1, 2 } }, // psubb
    { ISD::ADD,  MVT::v32i8,   {  1,  1, 1, 2 } }, // paddb
    { ISD::SUB,  MVT::v16i16,  {  1,  1, 1, 2 } }, // psubw
    { ISD::ADD,  MVT::v16i16,  {  1,  1, 1, 2 } }, // paddw
    { ISD::SUB,  MVT::v8i32,   {  1,  1, 1, 2 } }, // psubd
    { ISD::ADD,  MVT::v8i32,   {  1,  1, 1, 2 } }, // paddd
    { ISD::SUB,  MVT::v4i64,   {  1,  1, 1, 2 } }, // psubq
    { ISD::ADD,  MVT::v4i64,   {  1,  1, 1, 2 } }, // paddq

    { ISD::MUL,  MVT::v16i8,   {  5, 18, 6,12 } }, // extend/pmullw/pack
    { ISD::MUL,  MVT::v32i8,   {  4,  8, 8,16 } }, // pmaddubsw
    { ISD::MUL,  MVT::v16i16,  {  2,  5, 1, 2 } }, // pmullw
    { ISD::MUL,  MVT::v8i32,   {  4, 10, 1, 2 } }, // pmulld
    { ISD::MUL,  MVT::v4i32,   {  2, 10, 1, 2 } }, // pmulld
    { ISD::MUL,  MVT::v4i64,   {  6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::v2i64,   {  6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1,  5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64,   {  1,  1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32,   {  1,  1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64,     {  1,  4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32,     {  1,  4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64,   {  1,  4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32,   {  1,  4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64,   {  1,  4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32,   {  1,  4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64,     {  1,  4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32,     {  1,  4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64,   {  1,  4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32,   {  1,  4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64,   {  1,  4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32,   {  1,  4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64,     {  1,  5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32,     {  1,  5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64,   {  1,  5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32,   {  1,  5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64,   {  1,  5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32,   {  1,  5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32,     {  7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32,   {  7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32,   { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64,     { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64,   { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64,   { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1197
  // AVX1 costs: 256-bit integer ops are generally split into two 128-bit
  // halves (AVX1 has no 256-bit integer ALU).
  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v32i8,   { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL,  MVT::v16i8,   {  5,  6,  8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL,  MVT::v16i16,  {  4,  8,  5,  6 } }, // pmullw + split
    { ISD::MUL,  MVT::v8i32,   {  5,  8,  5, 10 } }, // pmulld + split
    { ISD::MUL,  MVT::v4i32,   {  2,  5,  1,  3 } }, // pmulld
    { ISD::MUL,  MVT::v4i64,   { 12, 15, 19, 20 } },

    { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split

    { ISD::AND,  MVT::v32i8,   {  1,  1, 1, 2 } }, // vandps
    { ISD::AND,  MVT::v16i16,  {  1,  1, 1, 2 } }, // vandps
    { ISD::AND,  MVT::v8i32,   {  1,  1, 1, 2 } }, // vandps
    { ISD::AND,  MVT::v4i64,   {  1,  1, 1, 2 } }, // vandps

    { ISD::OR,   MVT::v32i8,   {  1,  1, 1, 2 } }, // vorps
    { ISD::OR,   MVT::v16i16,  {  1,  1, 1, 2 } }, // vorps
    { ISD::OR,   MVT::v8i32,   {  1,  1, 1, 2 } }, // vorps
    { ISD::OR,   MVT::v4i64,   {  1,  1, 1, 2 } }, // vorps

    { ISD::XOR,  MVT::v32i8,   {  1,  1, 1, 2 } }, // vxorps
    { ISD::XOR,  MVT::v16i16,  {  1,  1, 1, 2 } }, // vxorps
    { ISD::XOR,  MVT::v8i32,   {  1,  1, 1, 2 } }, // vxorps
    { ISD::XOR,  MVT::v4i64,   {  1,  1, 1, 2 } }, // vxorps

    { ISD::SUB,  MVT::v32i8,   {  4,  2, 5, 6 } }, // psubb + split
    { ISD::ADD,  MVT::v32i8,   {  4,  2, 5, 6 } }, // paddb + split
    { ISD::SUB,  MVT::v16i16,  {  4,  2, 5, 6 } }, // psubw + split
    { ISD::ADD,  MVT::v16i16,  {  4,  2, 5, 6 } }, // paddw + split
    { ISD::SUB,  MVT::v8i32,   {  4,  2, 5, 6 } }, // psubd + split
    { ISD::ADD,  MVT::v8i32,   {  4,  2, 5, 6 } }, // paddd + split
    { ISD::SUB,  MVT::v4i64,   {  4,  2, 5, 6 } }, // psubq + split
    { ISD::ADD,  MVT::v4i64,   {  4,  2, 5, 6 } }, // paddq + split
    { ISD::SUB,  MVT::v2i64,   {  1,  1, 1, 1 } }, // psubq
    { ISD::ADD,  MVT::v2i64,   {  1,  1, 1, 1 } }, // paddq

    { ISD::SHL,  MVT::v16i8,   { 10, 21,11,17 } }, // pblendvb sequence.
    { ISD::SHL,  MVT::v32i8,   { 22, 22,27,40 } }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,   {  6,  9,11,11 } }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16,  { 13, 16,24,25 } }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,   {  3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   {  9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL,  MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL,  MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRL,  MVT::v16i8,   { 11, 27,12,18 } }, // pblendvb sequence.
    { ISD::SRL,  MVT::v32i8,   { 23, 23,30,43 } }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRL,  MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRL,  MVT::v2i64,   {  2,  4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v4i64,   {  6,  7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v16i8,   { 21, 22,24,36 } }, // pblendvb sequence.
    { ISD::SRA,  MVT::v32i8,   { 44, 45,51,76 } }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v8i16,   { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRA,  MVT::v16i16,  { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,   {  6,  7,12,16 } }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,   { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRA,  MVT::v2i64,   {  5,  6,10,14 } }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v4i64,   { 12, 12,22,30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   {  2,  2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32,     {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,   {  1,  5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   {  2,  5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64,     {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32,     {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   {  2,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,   {  1,  5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   {  4,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   {  2,  5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,   { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,     { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,   { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,   { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1300
  // SSE4.2 (Nehalem-era) fp arithmetic and v2i64 multiply costs.
  static const CostKindTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,    {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32 ,   {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  {  1,  3, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  {  1,  5, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::MUL,  MVT::v2i64,  {  6, 10,10,10 } }  // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1329
  // SSE4.1: pblendvb makes variable shift sequences cheaper, and pmulld
  // provides a native v4i32 multiply.
  static const CostKindTblEntry SSE41CostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 15, 24,17,22 } }, // pblendvb sequence.
    { ISD::SHL,  MVT::v8i16,  { 11, 14,11,11 } }, // pblendvb sequence.
    { ISD::SHL,  MVT::v4i32,  { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld

    { ISD::SRL,  MVT::v16i8,  { 16, 27,18,24 } }, // pblendvb sequence.
    { ISD::SRL,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
    { ISD::SRL,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.

    { ISD::SRA,  MVT::v16i8,  { 38, 41,30,36 } }, // pblendvb sequence.
    { ISD::SRA,  MVT::v8i16,  { 22, 26,23,27 } }, // pblendvb sequence.
    { ISD::SRA,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,  {  8, 17, 5, 7 } }, // splat+shuffle sequence.

    { ISD::MUL,  MVT::v4i32,  {  2, 11, 1, 1 } }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1352
  // SSSE3: pmaddubsw-based v16i8 multiply.
  static const CostKindTblEntry SSSE3CostTable[] = {
    { ISD::MUL,  MVT::v16i8,  {  5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
  };

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1361
  // Baseline SSE2 costs (worst case — everything from here down is ancient).
  static const CostKindTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,  { 13, 21,26,28 } }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,  { 24, 27,16,20 } }, // cmpgtw sequence.
    { ISD::SHL,  MVT::v4i32,  { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
    { ISD::SHL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.

    { ISD::SRL,  MVT::v16i8,  { 14, 28,27,30 } }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
    { ISD::SRL,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,  {  4,  6, 5, 7 } }, // splat+shuffle sequence.

    { ISD::SRA,  MVT::v16i8,  { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,  { 16, 19,31,31 } }, // cmpgtw sequence.
    { ISD::SRA,  MVT::v4i32,  { 12, 12,15,19 } }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,  {  8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.

    { ISD::AND,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pand
    { ISD::AND,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pand
    { ISD::AND,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pand
    { ISD::AND,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pand

    { ISD::OR,   MVT::v16i8,  {  1,  1, 1, 1 } }, // por
    { ISD::OR,   MVT::v8i16,  {  1,  1, 1, 1 } }, // por
    { ISD::OR,   MVT::v4i32,  {  1,  1, 1, 1 } }, // por
    { ISD::OR,   MVT::v2i64,  {  1,  1, 1, 1 } }, // por

    { ISD::XOR,  MVT::v16i8,  {  1,  1, 1, 1 } }, // pxor
    { ISD::XOR,  MVT::v8i16,  {  1,  1, 1, 1 } }, // pxor
    { ISD::XOR,  MVT::v4i32,  {  1,  1, 1, 1 } }, // pxor
    { ISD::XOR,  MVT::v2i64,  {  1,  1, 1, 1 } }, // pxor

    { ISD::ADD,  MVT::v2i64,  {  1,  2, 1, 2 } }, // paddq
    { ISD::SUB,  MVT::v2i64,  {  1,  2, 1, 2 } }, // psubq

    { ISD::MUL,  MVT::v16i8,  {  6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
    { ISD::MUL,  MVT::v8i16,  {  1,  5, 1, 1 } }, // pmullw
    { ISD::MUL,  MVT::v4i32,  {  6,  8, 7, 7 } }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,  {  7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v2i64, { 1,  5, 1, 1 } },

    { ISD::FDIV, MVT::f32,    { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::f64,    {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v2f64,  {  1,  1, 1, 1 } }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,    {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,  {  2,  3, 1, 1 } }, // Pentium IV from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  {  2,  5, 1, 1 } }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1431
1432 static const CostKindTblEntry SSE1CostTable[] = {
1433 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1438
1439 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1440 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1441
1442 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1443 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1444
1445 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1446 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1447 };
1448
1449 if (ST->hasSSE1())
1450 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1451 if (auto KindCost = Entry->Cost[CostKind])
1452 return LT.first * *KindCost;
1453
1454 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1455 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1456 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1457 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1458 };
1459
1460 if (ST->is64Bit())
1461 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1462 if (auto KindCost = Entry->Cost[CostKind])
1463 return LT.first * *KindCost;
1464
1465 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1466 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1467 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1468 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1469
1470 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1471 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1472 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1473
1474 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1475 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1476 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1477
1478 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1479 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1480 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1481 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1482 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1483 };
1484
1485 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1486 if (auto KindCost = Entry->Cost[CostKind])
1487 return LT.first * *KindCost;
1488
1489 // It is not a good idea to vectorize division. We have to scalarize it and
1490 // in the process we will often end up having to spill regular
1491 // registers. The overhead of division is going to dominate most kernels
1492 // anyways so try hard to prevent vectorization of division - it is
1493 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1494 // to hide "20 cycles" for each lane.
1495 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1496 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1497 ISD == ISD::UREM)) {
1498 InstructionCost ScalarCost =
1499 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1500 Op1Info.getNoProps(), Op2Info.getNoProps());
1501 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1502 }
1503
1504 // Handle some basic single instruction code size cases.
1505 if (CostKind == TTI::TCK_CodeSize) {
1506 switch (ISD) {
1507 case ISD::FADD:
1508 case ISD::FSUB:
1509 case ISD::FMUL:
1510 case ISD::FDIV:
1511 case ISD::FNEG:
1512 case ISD::AND:
1513 case ISD::OR:
1514 case ISD::XOR:
1515 return LT.first;
1516 break;
1517 }
1518 }
1519
1520 // Fallback to the default implementation.
1521 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1522 Args, CxtI);
1523}
1524
1527 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1529 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1530 return TTI::TCC_Basic;
1532}
1533
1535 VectorType *DstTy, VectorType *SrcTy,
1536 ArrayRef<int> Mask,
1538 int Index, VectorType *SubTp,
1540 const Instruction *CxtI) const {
1541 assert((Mask.empty() || DstTy->isScalableTy() ||
1542 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1543 "Expected the Mask to match the return size if given");
1544 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1545 "Expected the same scalar types");
1546
1547 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1548 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1549 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1550
1551 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1552
1553 // If all args are constant then this will be constant folded away.
1554 if (!Args.empty() &&
1555 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1556 return TTI::TCC_Free;
1557
1558 // Recognize a basic concat_vector shuffle.
1559 if (Kind == TTI::SK_PermuteTwoSrc &&
1560 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1561 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1565 CostKind, Mask.size() / 2, SrcTy);
1566
1567 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1568 if (Kind == TTI::SK_Transpose)
1569 if (LT.second != MVT::v4f64 && LT.second != MVT::v4i64)
1570 Kind = TTI::SK_PermuteTwoSrc;
1571
1572 if (Kind == TTI::SK_Broadcast) {
1573 // For Broadcasts we are splatting the first element from the first input
1574 // register, so only need to reference that input and all the output
1575 // registers are the same.
1576 LT.first = 1;
1577
1578 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1579 // If a multi-use load's every use is one of a small set of operations
1580 // that SLP can rewrite into a single vector lane, codegen can fold it into
1581 // the free broadcast.
1582 using namespace PatternMatch;
1583 auto IsBroadcastLoadFoldUser = [&](const User *U) {
1584 if (isa<InsertElementInst>(U) && U->getOperand(1) == Args[0])
1585 return true;
1586 if (U->getType()->isVectorTy())
1587 return false;
1588 // Terminators (return/branch/switch/indirectbr/resume/invoke EH)
1589 // and phis carry the value across control flow.
1590 if (const auto *I = dyn_cast<Instruction>(U))
1591 if (I->isTerminator() ||
1593 return false;
1594 // Only pure calls can be folded.
1595 if (const auto *CB = dyn_cast<CallBase>(U))
1596 return CB->doesNotAccessMemory() && !CB->mayHaveSideEffects();
1597 return true;
1598 };
1599 auto IsFoldableSLPBroadcastLoad = [&]() {
1600 if (!match(Args[0], m_Load(m_Value())))
1601 return false;
1602 auto *FVT = dyn_cast<FixedVectorType>(DstTy);
1603 if (!FVT)
1604 return false;
1605 // getNumUses() counts each Use, matching the per-lane broadcast
1606 // accounting (a use like `op %x, %x` consumes two broadcast lanes).
1607 if (Args[0]->getNumUses() != FVT->getNumElements())
1608 return false;
1609 return all_of(Args[0]->users(), IsBroadcastLoadFoldUser);
1610 };
1611 if (!Args.empty() &&
1612 (match(Args[0], m_OneUse(m_Load(m_Value()))) ||
1613 IsFoldableSLPBroadcastLoad()) &&
1614 (ST->hasAVX2() ||
1615 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1616 return TTI::TCC_Free;
1617 }
1618
1619 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1620 // permutation.
1621 // Attempt to detect a shuffle mask with a single defined element.
1622 bool IsInLaneShuffle = false;
1623 bool IsSingleElementMask = false;
1624 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1625 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1626 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1627 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1628 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1629 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1630 if ((Mask.size() % NumLanes) == 0) {
1631 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1632 return P.value() == PoisonMaskElem ||
1633 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1634 (P.index() / NumEltsPerLane);
1635 });
1636 IsSingleElementMask =
1637 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1638 return M == PoisonMaskElem;
1639 }));
1640 }
1641 }
1642
1643 // Treat <X x bfloat> shuffles as <X x half>.
1644 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1645 LT.second = LT.second.changeVectorElementType(MVT::f16);
1646
1647 // Subvector extractions are free if they start at the beginning of a
1648 // vector and cheap if the subvectors are aligned.
1649 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1650 int NumElts = LT.second.getVectorNumElements();
1651 if ((Index % NumElts) == 0)
1652 return TTI::TCC_Free;
1653 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1654 if (SubLT.second.isVector()) {
1655 int NumSubElts = SubLT.second.getVectorNumElements();
1656 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1657 return SubLT.first;
1658 // Handle some cases for widening legalization. For now we only handle
1659 // cases where the original subvector was naturally aligned and evenly
1660 // fit in its legalized subvector type.
1661 // FIXME: Remove some of the alignment restrictions.
1662 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1663 // vectors.
1664 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1665 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1666 (NumSubElts % OrigSubElts) == 0 &&
1667 LT.second.getVectorElementType() ==
1668 SubLT.second.getVectorElementType() &&
1669 LT.second.getVectorElementType().getSizeInBits() ==
1670 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1671 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1672 "Unexpected number of elements!");
1673 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1674 LT.second.getVectorNumElements());
1675 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1676 SubLT.second.getVectorNumElements());
1677 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1678 InstructionCost ExtractCost =
1680 ExtractIndex, SubTy);
1681
1682 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1683 // if we have SSSE3 we can use pshufb.
1684 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1685 return ExtractCost + 1; // pshufd or pshufb
1686
1687 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1688 "Unexpected vector size");
1689
1690 return ExtractCost + 2; // worst case pshufhw + pshufd
1691 }
1692 }
1693 // If the extract subvector is not optimal, treat it as a single-op shuffle.
1695 }
1696
1697 // Subvector insertions are cheap if the subvectors are aligned.
1698 // Note that in general, the insertion starting at the beginning of a vector
1699 // isn't free, because we need to preserve the rest of the wide vector,
1700 // but if the destination vector legalizes to the same width as the subvector
1701 // then the insertion will simplify to a (free) register copy.
1702 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1703 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1704 int NumElts = DstLT.second.getVectorNumElements();
1705 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1706 if (SubLT.second.isVector()) {
1707 int NumSubElts = SubLT.second.getVectorNumElements();
1708 bool MatchingTypes =
1709 NumElts == NumSubElts &&
1710 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1711 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1712 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1713 }
1714
1715 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1716 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1717 // v1f32 (legalised to f32) into a v4f32.
1718 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1719 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1720 return 1;
1721
1722 // If the insertion is the lowest subvector then it will be blended
1723 // otherwise treat it like a 2-op shuffle.
1724 Kind =
1725 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1726 }
1727
1728 // Handle some common (illegal) sub-vector types as they are often very cheap
1729 // to shuffle even on targets without PSHUFB.
1730 EVT VT = TLI->getValueType(DL, SrcTy);
1731 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1732 !ST->hasSSSE3()) {
1733 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1734 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1735 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1736 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1737 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1738 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1739
1740 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1741 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1742 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1743 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1744
1745 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1746 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1747 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1748 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1749
1750 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1751 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1752 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1753 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1754 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1755
1756 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1757 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1758 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1759 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1760 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1761 };
1762
1763 if (ST->hasSSE2())
1764 if (const auto *Entry =
1765 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1766 if (auto KindCost = Entry->Cost[CostKind])
1767 return LT.first * *KindCost;
1768 }
1769
1770 // We are going to permute multiple sources and the result will be in multiple
1771 // destinations. We provide an accurate cost only for splits where the element
1772 // type remains the same.
1773 if (LT.first != 1) {
1774 MVT LegalVT = LT.second;
1775 if (LegalVT.isVector() &&
1776 LegalVT.getVectorElementType().getSizeInBits() ==
1777 SrcTy->getElementType()->getPrimitiveSizeInBits() &&
1778 LegalVT.getVectorNumElements() <
1779 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1780 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1781 unsigned LegalVTSize = LegalVT.getStoreSize();
1782 // Number of source vectors after legalization:
1783 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1784 // Number of destination vectors after legalization:
1785 InstructionCost NumOfDests = LT.first;
1786
1787 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1788 LegalVT.getVectorNumElements());
1789
1790 if (!Mask.empty() && NumOfDests.isValid()) {
1791 // Try to perform better estimation of the permutation.
1792 // 1. Split the source/destination vectors into real registers.
1793 // 2. Do the mask analysis to identify which real registers are
1794 // permuted. If more than 1 source registers are used for the
1795 // destination register building, the cost for this destination register
1796 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1797 // source register is used, build mask and calculate the cost as a cost
1798 // of PermuteSingleSrc.
1799 // Also, for the single register permute we try to identify if the
1800 // destination register is just a copy of the source register or the
1801 // copy of the previous destination register (the cost is
1802 // TTI::TCC_Basic). If the source register is just reused, the cost for
1803 // this operation is TTI::TCC_Free.
1804 NumOfDests =
1806 FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1807 .first;
1808 unsigned E = NumOfDests.getValue();
1809 unsigned NormalizedVF =
1810 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1811 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1812 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1813 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1814 copy(Mask, NormalizedMask.begin());
1815 unsigned PrevSrcReg = 0;
1816 ArrayRef<int> PrevRegMask;
1819 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1820 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1821 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1822 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1823 // Check if the previous register can be just copied to the next
1824 // one.
1825 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1826 PrevRegMask != RegMask)
1827 Cost +=
1829 SingleOpTy, RegMask, CostKind, 0, nullptr);
1830 else
1831 // Just a copy of previous destination register.
1833 return;
1834 }
1835 if (SrcReg != DestReg &&
1836 any_of(RegMask, not_equal_to(PoisonMaskElem))) {
1837 // Just a copy of the source register.
1839 }
1840 PrevSrcReg = SrcReg;
1841 PrevRegMask = RegMask;
1842 },
1843 [this, SingleOpTy, CostKind,
1844 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1845 unsigned /*Unused*/, bool /*Unused*/) {
1847 SingleOpTy, RegMask, CostKind, 0, nullptr);
1848 });
1849 return Cost;
1850 }
1851
1852 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1853 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1854 SingleOpTy, {}, CostKind, 0,
1855 nullptr);
1856 }
1857
1858 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1859 SubTp);
1860 }
1861
1862 // If we're just moving a single element around (probably as an alternative to
1863 // extracting it), we can assume this is cheap.
1864 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1865 return TTI::TCC_Basic;
1866
1867 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1868 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1869 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1870 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1871 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1872 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1873 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1874 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1875 };
1876
1877 if (ST->hasVBMI())
1878 if (const auto *Entry =
1879 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1880 if (auto KindCost = Entry->Cost[CostKind])
1881 return LT.first * *KindCost;
1882
1883 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1884 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1885 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1886 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1887
1888 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1889 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1890 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1891 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1892 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1893
1894 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1895 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1896 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1897 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1898 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1899
1900 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1901 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1902 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1903 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1904 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1905
1906 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1907 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1908
1909 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1910 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1911 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1912 };
1913
1914 if (ST->hasBWI())
1915 if (const auto *Entry =
1916 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1917 if (auto KindCost = Entry->Cost[CostKind])
1918 return LT.first * *KindCost;
1919
1920 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1921 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1922 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1923 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1924 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1925 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1926 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1927 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1928 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1929 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1930 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1931 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1932 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1933 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1934 {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
1935
1936 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1937 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1938 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1939 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1940 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1941 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1942 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1943
1944 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1945 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1946 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1947 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1948 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1949 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1950 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1951 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1952 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1953 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1954 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1955
1956 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1957 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1958 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1959 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1960 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1961 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1962 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1963 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1964 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1965 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1966 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1967 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1968 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1969
1970 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1971 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1972 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1973 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1974 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1975 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1976 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1977 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1978 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1979 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1980 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1981 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1982
1983 // FIXME: This just applies the type legalization cost rules above
1984 // assuming these completely split.
1985 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1986 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1987 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1988 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1989 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1990 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1991
1992 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1993 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1994 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1995 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1996 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1997 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1998 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1999 };
2000
2001 if (ST->hasAVX512())
2002 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
2003 if (auto KindCost = Entry->Cost[CostKind])
2004 return LT.first * *KindCost;
2005
2006 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
2007 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
2008 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
2009 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
2010
2011 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2012 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2013
2014 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2015 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2016 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
2017 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
2018 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2019 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2020 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2021 };
2022
2023 if (IsInLaneShuffle && ST->hasAVX2())
2024 if (const auto *Entry =
2025 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
2026 if (auto KindCost = Entry->Cost[CostKind])
2027 return LT.first * *KindCost;
2028
2029 static const CostKindTblEntry AVX2ShuffleTbl[] = {
2030 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
2031 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
2032 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
2033 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
2034 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
2035 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
2036 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
2037 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
2038 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2039 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2040
2041 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2042 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2043 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2044 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2045 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2046 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2047 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2048
2049 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2050 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2051 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2052
2053 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2054 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2055 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2056 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2057 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2058
2059 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2060 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2061 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2062 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2063 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2064 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2065 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2066
2067 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2068 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2069 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2070 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2071 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2072 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2073 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2074 };
2075
2076 if (ST->hasAVX2())
2077 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2078 if (auto KindCost = Entry->Cost[CostKind])
2079 return LT.first * *KindCost;
2080
2081 static const CostKindTblEntry XOPShuffleTbl[] = {
2082 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2083 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2084 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2085 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2086 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2087 // + vinsertf128
2088 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2089 // + vinsertf128
2090
2091 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2092 // + vinsertf128
2093
2094 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2095 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2096 // + vinsertf128
2097 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2098 };
2099
2100 if (ST->hasXOP())
2101 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2102 if (auto KindCost = Entry->Cost[CostKind])
2103 return LT.first * *KindCost;
2104
2105 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2106 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2107 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2108 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2109 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2110
2111 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2112 // + vpor + vinsertf128
2113 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2114 // + vpor + vinsertf128
2115 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2116 // + vpor + vinsertf128
2117
2118 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2119 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2120
2121 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2122 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2123 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2124 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2125 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2126 // + 2*vpor + vinsertf128
2127 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2128 // + 2*vpor + vinsertf128
2129 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2130 // + 2*vpor + vinsertf128
2131 };
2132
2133 if (IsInLaneShuffle && ST->hasAVX())
2134 if (const auto *Entry =
2135 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2136 if (auto KindCost = Entry->Cost[CostKind])
2137 return LT.first * *KindCost;
2138
2139 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2140 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2141 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2142 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2143 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2144 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2145 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2146 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2147
2148 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2149 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2150 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2151 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2152 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2153 // + vinsertf128
2154 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2155 // + vinsertf128
2156 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2157 // + vinsertf128
2158
2159 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2160 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2161 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2162 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2163 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2164 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2165 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2166
2167 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2168 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2169 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2170 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2171 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2172 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2173 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2174
2175 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2176 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2177 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2178 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2179 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2180 // + 2*por + vinsertf128
2181 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2182 // + 2*por + vinsertf128
2183 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2184 // + 2*por + vinsertf128
2185
2186 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2187 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2188 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2189 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2190 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2191 // + 4*por + vinsertf128
2192 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2193 // + 4*por + vinsertf128
2194 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2195 // + 4*por + vinsertf128
2196 };
2197
2198 if (ST->hasAVX())
2199 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2200 if (auto KindCost = Entry->Cost[CostKind])
2201 return LT.first * *KindCost;
2202
2203 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2204 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2205 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2206 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2207 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2208 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2209 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2210 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2211 };
2212
2213 if (ST->hasSSE41())
2214 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2215 if (auto KindCost = Entry->Cost[CostKind])
2216 return LT.first * *KindCost;
2217
2218 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2219 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2220 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2221 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2222
2223 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2224 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2225 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2226
2227 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2228 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2229 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2230
2231 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2232 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2233 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2234 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2235 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2236
2237 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2238 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2239 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2240
2241 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2242 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2243 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2244 };
2245
2246 if (ST->hasSSSE3())
2247 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2248 if (auto KindCost = Entry->Cost[CostKind])
2249 return LT.first * *KindCost;
2250
2251 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2252 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2253 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2254 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2255 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2256 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2257 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2258
2259 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2260 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2261 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2262 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2263 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2264 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2265 // + 2*pshufd + 2*unpck + packus
2266
2267 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2268 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2269 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2270 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2271 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2272 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2273
2274 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2275 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2276 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2277 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2278 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2279 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2280
2281 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2282 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2283 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2284 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2285 // + pshufd/unpck
2286 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2287 // + pshufd/unpck
2288 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2289 // + 2*pshufd + 2*unpck + 2*packus
2290
2291 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2292 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2293 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2294 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2295 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2296 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2297 };
2298
2299 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2300 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2301 };
2302
2303 if (ST->hasSSE2()) {
2304 bool IsLoad =
2305 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2306 if (ST->hasSSE3() && IsLoad)
2307 if (const auto *Entry =
2308 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2309 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2310 LT.second.getVectorElementCount()) &&
2311 "Table entry missing from isLegalBroadcastLoad()");
2312 return LT.first * Entry->Cost;
2313 }
2314
2315 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2316 if (auto KindCost = Entry->Cost[CostKind])
2317 return LT.first * *KindCost;
2318 }
2319
2320 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2321 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2322 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2323 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2324 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2325 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2326 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2327 };
2328
2329 if (ST->hasSSE1()) {
2330 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2331 // SHUFPS: both pairs must come from the same source register.
2332 auto MatchSHUFPS = [](int X, int Y) {
2333 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2334 };
2335 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2336 return 1;
2337 }
2338 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2339 if (auto KindCost = Entry->Cost[CostKind])
2340 return LT.first * *KindCost;
2341 }
2342
2343 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2344 SubTp);
2345}
2346
2348 Type *Src,
2351 const Instruction *I) const {
2352 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2353 assert(ISD && "Invalid opcode");
2354
2355 // The cost tables include both specific, custom (non-legal) src/dst type
2356 // conversions and generic, legalized types. We test for customs first, before
2357 // falling back to legalization.
2358 // FIXME: Need a better design of the cost table to handle non-simple types of
2359 // potential massive combinations (elem_num x src_type x dst_type).
2360 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2361 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2362 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2363
2364 // Mask sign extend has an instruction.
2365 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2366 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2367 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2368 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2369 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2370 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2371 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2372 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2373 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2374 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2375 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2376 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2377 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2378 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2379 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2380 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2381 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2382
2383 // Mask zero extend is a sext + shift.
2384 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2385 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2386 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2387 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2388 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2389 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2390 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2391 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2392 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2393 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2394 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2395 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2396 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2397 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2398 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2399 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2400 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2401
2402 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2403 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2404 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2405 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2406 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2407 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2408 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2409 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2410 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2411 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2412 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2413 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2414 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2415 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2416 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2417 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2418 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2419
2420 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2421 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2422 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2423 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2424 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2425 };
2426
2427 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2428 // Mask sign extend has an instruction.
2429 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2430 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2431 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2432 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2433 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2434 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2435 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2436 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2437
2438 // Mask zero extend is a sext + shift.
2439 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2440 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2441 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2442 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2443 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2444 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2445 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2446 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2447
2448 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2449 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2450 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2451 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2452 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2453 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2454 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2455 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2456
2457 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2458 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2459
2460 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2461 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2462
2463 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2464 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2465
2466 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2467 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2468 };
2469
2470 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2471 // 256-bit wide vectors.
2472
2473 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2474 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2475 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2476 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2477 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2478 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2479 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2480 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2481
2482 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2483 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2484 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2485 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2486 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2487 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2488 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2489 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2490 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2491 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2492 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2493 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2494 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2495 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2496 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2497 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2498 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2499 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2500 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2501 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2502 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2503 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2504 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2505 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2506 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2507 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2508 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2509 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2510 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2511 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2512 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2513 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2514 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2515 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2516
2517 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2518 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2519 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2520
2521 // Sign extend is zmm vpternlogd+vptruncdb.
2522 // Zero extend is zmm broadcast load+vptruncdw.
2523 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2524 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2525 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2526 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2527 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2528 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2529 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2530 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2531
2532 // Sign extend is zmm vpternlogd+vptruncdw.
2533 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2534 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2535 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2536 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2537 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2538 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2539 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2540 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2541 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2542
2543 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2544 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2545 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2546 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2547 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2548 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2549 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2550 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2551 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2552 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2553
2554 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2555 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2556 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2557 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2558
2559 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2560 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2561 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2562 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2563 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2564 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2565 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2566 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2567 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2568 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2569
2570 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2571 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2572
2573 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2574 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2575 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2576 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2577 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2578 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2579 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2580 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2581
2582 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2583 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2584 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2585 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2586 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2587 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2588 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2589 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2590 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2591 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2592
2593 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2594 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2595 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2596 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2597 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2598 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2599 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2600 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2601 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2602 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2603 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2604
2605 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2606 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2607 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2608 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2609 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2610 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2611 };
2612
2613 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] {
2614 // Mask sign extend has an instruction.
2615 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2616 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2617 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2618 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2619 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2620 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2621 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2622 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2623 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2624 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2625 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2626 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2627 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2628 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2629 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2630 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2631 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2632
2633 // Mask zero extend is a sext + shift.
2634 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2635 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2636 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2637 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2638 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2639 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2640 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2641 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2642 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2643 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2644 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2645 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2646 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2647 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2648 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2649 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2650 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2651
2652 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2653 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2654 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2655 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2656 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2657 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2658 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2659 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2660 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2661 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2662 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2663 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2664 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2665 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2666 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2667 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2668 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2669
2670 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2671 };
2672
  // Conversion cost entries available with AVX512DQ + AVX512VL (sub-512-bit
  // vectors); consulted via ST->hasDQI() in the lookup code further down.
  // Each entry maps {ISD opcode, dst MVT, src MVT} to one cost per
  // TargetCostKind (see the cost-interpretation notes in the file header).
  static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
    // Mask sign extend has an instruction.
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1,  { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1,  { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1,  { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1,  { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1,  { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1,  { 1, 1, 1, 1 } },

    // Mask zero extend is a sext + shift.
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1,  { 2, 1, 1, 1 } },

    { ISD::TRUNCATE,    MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v2i1,  MVT::v2i64, { 2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v2i1,  MVT::v4i32, { 2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v4i1,  MVT::v4i32, { 2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v4i1,  MVT::v4i64, { 2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i1,  MVT::v4i64, { 2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i1,  MVT::v8i32, { 2, 1, 1, 1 } },

    // i64 <-> fp conversions are single-cost at this feature level.
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
  };
2723
  // Conversion cost entries available with baseline AVX512F + AVX512VL
  // (sub-512-bit vectors); consulted via ST->hasAVX512() after the more
  // specific BWVL/DQVL tables in the lookup code further down. Each entry
  // maps {ISD opcode, dst MVT, src MVT} to one cost per TargetCostKind
  // (see the cost-interpretation notes in the file header).
  static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
    { ISD::TRUNCATE,  MVT::v2i1,  MVT::v2i8,   { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE,  MVT::v4i1,  MVT::v4i8,   { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE,  MVT::v8i1,  MVT::v8i8,   { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE,  MVT::v16i1, MVT::v16i8,  { 8, 1, 1, 1 } }, // split+2*v8i8
    { ISD::TRUNCATE,  MVT::v2i1,  MVT::v2i16,  { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE,  MVT::v4i1,  MVT::v4i16,  { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE,  MVT::v8i1,  MVT::v8i16,  { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE,  MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
    { ISD::TRUNCATE,  MVT::v2i1,  MVT::v2i32,  { 2, 1, 1, 1 } }, // vpslld+vptestmd
    { ISD::TRUNCATE,  MVT::v4i1,  MVT::v4i32,  { 2, 1, 1, 1 } }, // vpslld+vptestmd
    { ISD::TRUNCATE,  MVT::v8i1,  MVT::v8i32,  { 2, 1, 1, 1 } }, // vpslld+vptestmd
    { ISD::TRUNCATE,  MVT::v16i1, MVT::v8i32,  { 2, 1, 1, 1 } }, // vpslld+vptestmd
    { ISD::TRUNCATE,  MVT::v2i1,  MVT::v2i64,  { 2, 1, 1, 1 } }, // vpsllq+vptestmq
    { ISD::TRUNCATE,  MVT::v4i1,  MVT::v4i64,  { 2, 1, 1, 1 } }, // vpsllq+vptestmq
    { ISD::TRUNCATE,  MVT::v4i32, MVT::v4i64,  { 1, 1, 1, 1 } }, // vpmovqd
    { ISD::TRUNCATE,  MVT::v4i8,  MVT::v4i64,  { 2, 1, 1, 1 } }, // vpmovqb
    { ISD::TRUNCATE,  MVT::v4i16, MVT::v4i64,  { 2, 1, 1, 1 } }, // vpmovqw
    { ISD::TRUNCATE,  MVT::v8i8,  MVT::v8i32,  { 2, 1, 1, 1 } }, // vpmovwb

    // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,  { 5, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,  { 6, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,  { 5, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,  { 6, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,  { 5, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,  { 6, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, {10, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, {12, 1, 1, 1 } },

    // sign extend is vpcmpeq+maskedmove+vpmovdw
    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,  { 4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,  { 5, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,  { 4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,  { 5, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,  { 4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,  { 5, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },

    { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,  { 1, 1, 1, 1 } }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,  { 2, 1, 1, 1 } }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,  { 1, 1, 1, 1 } }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,  { 2, 1, 1, 1 } }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  { 1, 1, 1, 1 } }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  { 2, 1, 1, 1 } }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld

    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,  { 1, 1, 1, 1 } }, // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,  { 2, 1, 1, 1 } }, // vpternlogq+psrlq
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  { 1, 1, 1, 1 } }, // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  { 2, 1, 1, 1 } }, // vpternlogq+psrlq

    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, { 1, 1, 1, 1 } },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::f32,   MVT::i64,   { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,   MVT::i64,   { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::v16i8, MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::i64,   MVT::f32,   { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,   MVT::f64,   { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
  };
2824
  // Conversion cost entries for the AVX2 feature level. Each entry maps
  // {ISD opcode, dst MVT, src MVT} to one cost per TargetCostKind (see the
  // cost-interpretation notes in the file header).
  static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },

    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  { 2, 1, 1, 1 } },

    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 2, 1, 1, 1 } },

    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, { 4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  { 1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  { 4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  { 4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  { 5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  { 2, 1, 1, 1 } },

    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  { 3, 1, 1, 1 } },
    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  { 3, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  { 3, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    { 3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    { 3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  { 4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  { 3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  { 4, 1, 1, 1 } },

    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 3, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 4, 1, 1, 1 } },
  };
2899
  // Conversion cost entries for the AVX (AVX1) feature level. Each entry maps
  // {ISD opcode, dst MVT, src MVT} to one cost per TargetCostKind (see the
  // cost-interpretation notes in the file header).
  static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 4, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 4, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 4, 1, 1, 1 } },

    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  { 3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  { 3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  { 3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  { 3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  { 3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  { 3, 1, 1, 1 } },

    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  { 4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, { 4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  { 9, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i64, {11, 1, 1, 1 } },

    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, { 6, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  { 5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  { 5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  { 5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  { 3, 1, 1, 1 } }, // and+extract+2*packusdw
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  { 2, 1, 1, 1 } },

    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   { 3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   { 3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   { 8, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  { 5, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  { 8, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   { 7, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   { 7, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   { 6, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 6, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 8, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  {10, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  {10, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  {18, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  {10, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f64,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v4f64,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  { 5, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f64,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v4f64,  { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  { 6, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  { 7, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  { 7, 1, 1, 1 } },

    { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  { 1, 1, 1, 1 } },
    { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  { 1, 1, 1, 1 } },
  };
2996
  // Conversion cost entries for the SSE4.1 feature level. Each entry maps
  // {ISD opcode, dst MVT, src MVT} to one cost per TargetCostKind (see the
  // cost-interpretation notes in the file header).
  static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },

    // These truncates end up widening elements.
    { ISD::TRUNCATE,    MVT::v2i1,  MVT::v2i8,  { 1, 1, 1, 1 } }, // PMOVXZBQ
    { ISD::TRUNCATE,    MVT::v2i1,  MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ
    { ISD::TRUNCATE,    MVT::v4i1,  MVT::v4i8,  { 1, 1, 1, 1 } }, // PMOVXZBD

    { ISD::TRUNCATE,    MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },

    { ISD::SINT_TO_FP,  MVT::f32,   MVT::i32,   { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f64,   MVT::i32,   { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f32,   MVT::i64,   { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f64,   MVT::i64,   { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::f32,   MVT::i32,   { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,   MVT::i32,   { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f32,   MVT::i64,   { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,   MVT::i64,   { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::i32,   MVT::f32,   { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i64,   MVT::f32,   { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i32,   MVT::f64,   { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i64,   MVT::f64,   { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::i32,   MVT::f32,   { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,   MVT::f32,   { 4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i32,   MVT::f64,   { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,   MVT::f64,   { 4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
  };
3069
  // Conversion cost entries for the baseline SSE2 feature level. Each entry
  // maps {ISD opcode, dst MVT, src MVT} to one cost per TargetCostKind (see
  // the cost-interpretation notes in the file header).
  static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
    // These are somewhat magic numbers justified by comparing the
    // output of llvm-mca for our various supported scheduler models
    // and basing it off the worst case scenario.
    { ISD::SINT_TO_FP,  MVT::f32,   MVT::i32,   { 3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f64,   MVT::i32,   { 3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f32,   MVT::i64,   { 3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f64,   MVT::i64,   { 3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::f32,   MVT::i32,   { 3, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,   MVT::i32,   { 3, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f32,   MVT::i64,   { 8, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,   MVT::i64,   { 9, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::i32,   MVT::f32,   { 4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i64,   MVT::f32,   { 4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i32,   MVT::f64,   { 4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i64,   MVT::f64,   { 4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::i32,   MVT::f32,   { 4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,   MVT::f32,   { 4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i32,   MVT::f64,   { 4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,   MVT::f64,   {15, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },

    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },

    // These truncates are really widening elements.
    { ISD::TRUNCATE,    MVT::v2i1,  MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
    { ISD::TRUNCATE,    MVT::v2i1,  MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
    { ISD::TRUNCATE,    MVT::v2i1,  MVT::v2i8,  { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
    { ISD::TRUNCATE,    MVT::v4i1,  MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
    { ISD::TRUNCATE,    MVT::v4i1,  MVT::v4i8,  { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
    { ISD::TRUNCATE,    MVT::v8i1,  MVT::v8i8,  { 1, 1, 1, 1 } }, // PUNPCKLBW

    { ISD::TRUNCATE,    MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16,{ 3, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32,{ 7, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i16,MVT::v16i32,{10, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
    { ISD::TRUNCATE,    MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
  };
3156
  // Cost entries for f16 <-> f32/f64 conversions (half-precision convert
  // instructions such as vcvtph2ps, per the inline notes). Each entry maps
  // {ISD opcode, dst MVT, src MVT} to one cost per TargetCostKind (see the
  // cost-interpretation notes in the file header).
  static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
    { ISD::FP_ROUND,  MVT::f16,   MVT::f32,  { 1, 1, 1, 1 } },
    { ISD::FP_ROUND,  MVT::v8f16, MVT::v8f32,{ 1, 1, 1, 1 } },
    { ISD::FP_ROUND,  MVT::v4f16, MVT::v4f32,{ 1, 1, 1, 1 } },
    { ISD::FP_EXTEND, MVT::f32,   MVT::f16,  { 1, 1, 1, 1 } },
    { ISD::FP_EXTEND, MVT::f64,   MVT::f16,  { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
    { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16,{ 1, 1, 1, 1 } },
    { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16,{ 1, 1, 1, 1 } },
    { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16,{ 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
  };
3167
3168 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3169 EVT SrcTy = TLI->getValueType(DL, Src);
3170 EVT DstTy = TLI->getValueType(DL, Dst);
3171
3172 // If we're sign-extending a vector comparison result back to the comparison
3173 // width, this will be free without AVX512 (or for 8/16-bit types without
3174 // BWI).
3175 if (!ST->hasAVX512() || (!ST->hasBWI() && DstTy.getScalarSizeInBits() < 32)) {
3176 if (I && Opcode == Instruction::CastOps::SExt &&
3177 SrcTy.isFixedLengthVector() && SrcTy.getScalarType() == MVT::i1) {
3178 if (auto *CmpI = dyn_cast<CmpInst>(I->getOperand(0))) {
3179 Type *CmpTy = CmpI->getOperand(0)->getType();
3180 if (CmpTy->getScalarSizeInBits() == DstTy.getScalarSizeInBits())
3181 return TTI::TCC_Free;
3182 }
3183 }
3184 }
3185
3186 // The function getSimpleVT only handles simple value types.
3187 if (SrcTy.isSimple() && DstTy.isSimple()) {
3188 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3189 MVT SimpleDstTy = DstTy.getSimpleVT();
3190
3191 if (ST->useAVX512Regs()) {
3192 if (ST->hasBWI())
3193 if (const auto *Entry = ConvertCostTableLookup(
3194 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3195 if (auto KindCost = Entry->Cost[CostKind])
3196 return *KindCost;
3197
3198 if (ST->hasDQI())
3199 if (const auto *Entry = ConvertCostTableLookup(
3200 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3201 if (auto KindCost = Entry->Cost[CostKind])
3202 return *KindCost;
3203
3204 if (ST->hasAVX512())
3205 if (const auto *Entry = ConvertCostTableLookup(
3206 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3207 if (auto KindCost = Entry->Cost[CostKind])
3208 return *KindCost;
3209 }
3210
3211 if (ST->hasBWI())
3212 if (const auto *Entry = ConvertCostTableLookup(
3213 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3214 if (auto KindCost = Entry->Cost[CostKind])
3215 return *KindCost;
3216
3217 if (ST->hasDQI())
3218 if (const auto *Entry = ConvertCostTableLookup(
3219 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3220 if (auto KindCost = Entry->Cost[CostKind])
3221 return *KindCost;
3222
3223 if (ST->hasAVX512())
3224 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3225 SimpleDstTy, SimpleSrcTy))
3226 if (auto KindCost = Entry->Cost[CostKind])
3227 return *KindCost;
3228
3229 if (ST->hasAVX2()) {
3230 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3231 SimpleDstTy, SimpleSrcTy))
3232 if (auto KindCost = Entry->Cost[CostKind])
3233 return *KindCost;
3234 }
3235
3236 if (ST->hasAVX()) {
3237 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3238 SimpleDstTy, SimpleSrcTy))
3239 if (auto KindCost = Entry->Cost[CostKind])
3240 return *KindCost;
3241 }
3242
3243 if (ST->hasF16C()) {
3244 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3245 SimpleDstTy, SimpleSrcTy))
3246 if (auto KindCost = Entry->Cost[CostKind])
3247 return *KindCost;
3248 }
3249
3250 if (ST->hasSSE41()) {
3251 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3252 SimpleDstTy, SimpleSrcTy))
3253 if (auto KindCost = Entry->Cost[CostKind])
3254 return *KindCost;
3255 }
3256
3257 if (ST->hasSSE2()) {
3258 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3259 SimpleDstTy, SimpleSrcTy))
3260 if (auto KindCost = Entry->Cost[CostKind])
3261 return *KindCost;
3262 }
3263
3264 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3265 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3266 // fp16 conversions not covered by any table entries require a libcall.
3267 // Return a large (arbitrary) number to model this.
3268 return InstructionCost(64);
3269 }
3270 }
3271
3272 // Fall back to legalized types.
3273 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3274 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3275
3276 // If we're truncating to the same legalized type - just assume its free.
3277 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3278 return TTI::TCC_Free;
3279
3280 if (ST->useAVX512Regs()) {
3281 if (ST->hasBWI())
3282 if (const auto *Entry = ConvertCostTableLookup(
3283 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3284 if (auto KindCost = Entry->Cost[CostKind])
3285 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3286
3287 if (ST->hasDQI())
3288 if (const auto *Entry = ConvertCostTableLookup(
3289 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3290 if (auto KindCost = Entry->Cost[CostKind])
3291 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3292
3293 if (ST->hasAVX512())
3294 if (const auto *Entry = ConvertCostTableLookup(
3295 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3296 if (auto KindCost = Entry->Cost[CostKind])
3297 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3298 }
3299
3300 if (ST->hasBWI())
3301 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3302 LTDest.second, LTSrc.second))
3303 if (auto KindCost = Entry->Cost[CostKind])
3304 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3305
3306 if (ST->hasDQI())
3307 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3308 LTDest.second, LTSrc.second))
3309 if (auto KindCost = Entry->Cost[CostKind])
3310 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3311
3312 if (ST->hasAVX512())
3313 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3314 LTDest.second, LTSrc.second))
3315 if (auto KindCost = Entry->Cost[CostKind])
3316 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3317
3318 if (ST->hasAVX2())
3319 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3320 LTDest.second, LTSrc.second))
3321 if (auto KindCost = Entry->Cost[CostKind])
3322 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3323
3324 if (ST->hasAVX())
3325 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3326 LTDest.second, LTSrc.second))
3327 if (auto KindCost = Entry->Cost[CostKind])
3328 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3329
3330 if (ST->hasF16C()) {
3331 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3332 LTDest.second, LTSrc.second))
3333 if (auto KindCost = Entry->Cost[CostKind])
3334 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3335 }
3336
3337 if (ST->hasSSE41())
3338 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3339 LTDest.second, LTSrc.second))
3340 if (auto KindCost = Entry->Cost[CostKind])
3341 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3342
3343 if (ST->hasSSE2())
3344 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3345 LTDest.second, LTSrc.second))
3346 if (auto KindCost = Entry->Cost[CostKind])
3347 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3348
3349 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3350 // sitofp.
3351 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3352 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3353 Type *ExtSrc = Src->getWithNewBitWidth(32);
3354 unsigned ExtOpc =
3355 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3356
3357 // For scalar loads the extend would be free.
3358 InstructionCost ExtCost = 0;
3359 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3360 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3361
3362 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3364 }
3365
3366 // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3367 // i32.
3368 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3369 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3370 Type *TruncDst = Dst->getWithNewBitWidth(32);
3371 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3372 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3374 }
3375
3376 // TODO: Allow non-throughput costs that aren't binary.
3377 auto AdjustCost = [&CostKind](InstructionCost Cost,
3380 return Cost == 0 ? 0 : N;
3381 return Cost * N;
3382 };
3383 return AdjustCost(
3384 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3385}
3386
// NOTE(review): this span is a Doxygen source listing of what appears to be
// X86TTIImpl::getCmpSelInstrCost(); every line carries its original file line
// number as a prefix. The extraction dropped several lines (the function
// header at original lines ~3386-3387 plus the interior lines flagged below),
// so this text is NOT compilable as-is - confirm any detail against upstream
// llvm/lib/Target/X86/X86TargetTransformInfo.cpp.
3388     unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3390     TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3391   // Early out if this type isn't scalar/vector integer/float.
3392   if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3393     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3394                                      Op1Info, Op2Info, I);
3395
3396   // Legalize the type.
3397   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3398
3399   MVT MTy = LT.second;
3400
3401   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3402   assert(ISD && "Invalid opcode");
3403
// ExtraCost models the additional instructions needed to synthesize compare
// predicates that have no single pcmpeq/pcmpgt (or cmpps/cmppd) form on the
// selected feature level; it is added to the table cost in the lookups below.
3404   InstructionCost ExtraCost = 0;
3405   if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3406     // Some vector comparison predicates cost extra instructions.
3407     // TODO: Adjust ExtraCost based on CostKind?
3408     // TODO: Should we invert this and assume worst case cmp costs
3409     // and reduce for particular predicates?
// Targets with XOP (128-bit), AVX512 (>=32-bit elts) or BWI have full
// predicate support, so no ExtraCost is charged for them.
3410     if (MTy.isVector() &&
3411         !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3412           (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3413           ST->hasBWI())) {
3414       // Fallback to I if a specific predicate wasn't specified.
3415       CmpInst::Predicate Pred = VecPred;
// NOTE(review): original line 3417 (presumably the matching
// `Pred == CmpInst::BAD_FCMP_PREDICATE))` half of this condition) was
// dropped by the doc extraction - confirm upstream.
3416       if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3418         Pred = cast<CmpInst>(I)->getPredicate();
3419
// Comparisons against a constant RHS are cheaper: the constant can often be
// pre-adjusted so the inversion/bias instructions fold away.
3420       bool CmpWithConstant = false;
3421       if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3422         CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3423
// NOTE(review): the `case CmpInst::...:` labels of this switch (original
// lines 3425, 3429-3430, 3434-3435, 3440-3441, 3452-3453 and 3466-3467)
// were dropped by the doc extraction; only the case bodies survive below.
// The expansion patterns in the per-case comments indicate which predicates
// each body belonged to - confirm upstream.
3424       switch (Pred) {
3426         // xor(cmpeq(x,y),-1)
3427         ExtraCost = CmpWithConstant ? 0 : 1;
3428         break;
3431         // xor(cmpgt(x,y),-1)
3432         ExtraCost = CmpWithConstant ? 0 : 1;
3433         break;
3436         // cmpgt(xor(x,signbit),xor(y,signbit))
3437         // xor(cmpeq(pmaxu(x,y),x),-1)
3438         ExtraCost = CmpWithConstant ? 1 : 2;
3439         break;
// Unsigned le/ge-style compares: psubus/pminu tricks are available for the
// narrower element types, otherwise fall back to the signbit-flip expansion.
3442         if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3443             (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3444           // cmpeq(psubus(x,y),0)
3445           // cmpeq(pminu(x,y),x)
3446           ExtraCost = 1;
3447         } else {
3448           // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3449           ExtraCost = CmpWithConstant ? 2 : 3;
3450         }
3451         break;
3454         // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3455         // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
// NOTE(review): original lines 3458 and 3461 (the predicate/CostKind
// argument lines of these two recursive calls) were dropped by the doc
// extraction - confirm upstream.
3456         if (CondTy && !ST->hasAVX())
3457           return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3459                                     Op1Info, Op2Info) +
3460                  getCmpSelInstrCost(Opcode, ValTy, CondTy,
3462                                     Op1Info, Op2Info) +
3463                  getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3464
3465         break;
3468         // Assume worst case scenario and add the maximum extra cost.
3469         ExtraCost = 3;
3470         break;
3471       default:
3472         break;
3473       }
3474     }
3475   }
3476
// Per-feature-level cost tables for SETCC/SELECT. Each entry's 4-value array
// is indexed by TTI::TargetCostKind via `Entry->Cost[CostKind]` in the
// lookup cascade below; see the file header for how the four kinds are
// interpreted.
3477   static const CostKindTblEntry SLMCostTbl[] = {
3478     // slm pcmpeq/pcmpgt throughput is 2
3479     { ISD::SETCC,   MVT::v2i64,   {   2,   5,   1,   2 } },
3480     // slm pblendvb/blendvpd/blendvps throughput is 4
3481     { ISD::SELECT,  MVT::v2f64,   {   4,   4,   1,   3 } }, // vblendvpd
3482     { ISD::SELECT,  MVT::v4f32,   {   4,   4,   1,   3 } }, // vblendvps
3483     { ISD::SELECT,  MVT::v2i64,   {   4,   4,   1,   3 } }, // pblendvb
3484     { ISD::SELECT,  MVT::v8i32,   {   4,   4,   1,   3 } }, // pblendvb
3485     { ISD::SELECT,  MVT::v8i16,   {   4,   4,   1,   3 } }, // pblendvb
3486     { ISD::SELECT,  MVT::v16i8,   {   4,   4,   1,   3 } }, // pblendvb
3487   };
3488
3489   static const CostKindTblEntry AVX512BWCostTbl[] = {
3490     { ISD::SETCC,   MVT::v32i16,  {   1,   1,   1,   1 } },
3491     { ISD::SETCC,   MVT::v16i16,  {   1,   1,   1,   1 } },
3492     { ISD::SETCC,   MVT::v64i8,   {   1,   1,   1,   1 } },
3493     { ISD::SETCC,   MVT::v32i8,   {   1,   1,   1,   1 } },
3494
3495     { ISD::SELECT,  MVT::v32i16,  {   1,   1,   1,   1 } },
3496     { ISD::SELECT,  MVT::v64i8,   {   1,   1,   1,   1 } },
3497   };
3498
3499   static const CostKindTblEntry AVX512CostTbl[] = {
3500     { ISD::SETCC,   MVT::v8f64,   {   1,   4,   1,   1 } },
3501     { ISD::SETCC,   MVT::v4f64,   {   1,   4,   1,   1 } },
3502     { ISD::SETCC,   MVT::v16f32,  {   1,   4,   1,   1 } },
3503     { ISD::SETCC,   MVT::v8f32,   {   1,   4,   1,   1 } },
3504
3505     { ISD::SETCC,   MVT::v8i64,   {   1,   1,   1,   1 } },
3506     { ISD::SETCC,   MVT::v4i64,   {   1,   1,   1,   1 } },
3507     { ISD::SETCC,   MVT::v2i64,   {   1,   1,   1,   1 } },
3508     { ISD::SETCC,   MVT::v16i32,  {   1,   1,   1,   1 } },
3509     { ISD::SETCC,   MVT::v8i32,   {   1,   1,   1,   1 } },
// v32i16/v64i8 need splitting/emulation without BWI, hence the higher costs.
3510     { ISD::SETCC,   MVT::v32i16,  {   3,   7,   5,   5 } },
3511     { ISD::SETCC,   MVT::v64i8,   {   3,   7,   5,   5 } },
3512
3513     { ISD::SELECT,  MVT::v8i64,   {   1,   1,   1,   1 } },
3514     { ISD::SELECT,  MVT::v4i64,   {   1,   1,   1,   1 } },
3515     { ISD::SELECT,  MVT::v2i64,   {   1,   1,   1,   1 } },
3516     { ISD::SELECT,  MVT::v16i32,  {   1,   1,   1,   1 } },
3517     { ISD::SELECT,  MVT::v8i32,   {   1,   1,   1,   1 } },
3518     { ISD::SELECT,  MVT::v4i32,   {   1,   1,   1,   1 } },
3519     { ISD::SELECT,  MVT::v8f64,   {   1,   1,   1,   1 } },
3520     { ISD::SELECT,  MVT::v4f64,   {   1,   1,   1,   1 } },
3521     { ISD::SELECT,  MVT::v2f64,   {   1,   1,   1,   1 } },
3522     { ISD::SELECT,  MVT::f64,     {   1,   1,   1,   1 } },
3523     { ISD::SELECT,  MVT::v16f32,  {   1,   1,   1,   1 } },
3524     { ISD::SELECT,  MVT::v8f32 ,  {   1,   1,   1,   1 } },
3525     { ISD::SELECT,  MVT::v4f32,   {   1,   1,   1,   1 } },
3526     { ISD::SELECT,  MVT::f32  ,   {   1,   1,   1,   1 } },
3527
3528     { ISD::SELECT,  MVT::v32i16,  {   2,   2,   4,   4 } },
3529     { ISD::SELECT,  MVT::v16i16,  {   1,   1,   1,   1 } },
3530     { ISD::SELECT,  MVT::v8i16,   {   1,   1,   1,   1 } },
3531     { ISD::SELECT,  MVT::v64i8,   {   2,   2,   4,   4 } },
3532     { ISD::SELECT,  MVT::v32i8,   {   1,   1,   1,   1 } },
3533     { ISD::SELECT,  MVT::v16i8,   {   1,   1,   1,   1 } },
3534   };
3535
3536   static const CostKindTblEntry AVX2CostTbl[] = {
3537     { ISD::SETCC,   MVT::v4f64,   {   1,   4,   1,   2 } },
3538     { ISD::SETCC,   MVT::v2f64,   {   1,   4,   1,   1 } },
3539     { ISD::SETCC,   MVT::f64,     {   1,   4,   1,   1 } },
3540     { ISD::SETCC,   MVT::v8f32,   {   1,   4,   1,   2 } },
3541     { ISD::SETCC,   MVT::v4f32,   {   1,   4,   1,   1 } },
3542     { ISD::SETCC,   MVT::f32,     {   1,   4,   1,   1 } },
3543
3544     { ISD::SETCC,   MVT::v4i64,   {   1,   1,   1,   2 } },
3545     { ISD::SETCC,   MVT::v8i32,   {   1,   1,   1,   2 } },
3546     { ISD::SETCC,   MVT::v16i16,  {   1,   1,   1,   2 } },
3547     { ISD::SETCC,   MVT::v32i8,   {   1,   1,   1,   2 } },
3548
3549     { ISD::SELECT,  MVT::v4f64,   {   2,   2,   1,   2 } }, // vblendvpd
3550     { ISD::SELECT,  MVT::v8f32,   {   2,   2,   1,   2 } }, // vblendvps
3551     { ISD::SELECT,  MVT::v4i64,   {   2,   2,   1,   2 } }, // pblendvb
3552     { ISD::SELECT,  MVT::v8i32,   {   2,   2,   1,   2 } }, // pblendvb
3553     { ISD::SELECT,  MVT::v16i16,  {   2,   2,   1,   2 } }, // pblendvb
3554     { ISD::SELECT,  MVT::v32i8,   {   2,   2,   1,   2 } }, // pblendvb
3555   };
3556
3557   static const CostKindTblEntry XOPCostTbl[] = {
3558     { ISD::SETCC,   MVT::v4i64,   {   4,   2,   5,   6 } },
3559     { ISD::SETCC,   MVT::v2i64,   {   1,   1,   1,   1 } },
3560   };
3561
3562   static const CostKindTblEntry AVX1CostTbl[] = {
3563     { ISD::SETCC,   MVT::v4f64,   {   2,   3,   1,   2 } },
3564     { ISD::SETCC,   MVT::v2f64,   {   1,   3,   1,   1 } },
3565     { ISD::SETCC,   MVT::f64,     {   1,   3,   1,   1 } },
3566     { ISD::SETCC,   MVT::v8f32,   {   2,   3,   1,   2 } },
3567     { ISD::SETCC,   MVT::v4f32,   {   1,   3,   1,   1 } },
3568     { ISD::SETCC,   MVT::f32,     {   1,   3,   1,   1 } },
3569
3570     // AVX1 does not support 8-wide integer compare.
3571     { ISD::SETCC,   MVT::v4i64,   {   4,   2,   5,   6 } },
3572     { ISD::SETCC,   MVT::v8i32,   {   4,   2,   5,   6 } },
3573     { ISD::SETCC,   MVT::v16i16,  {   4,   2,   5,   6 } },
3574     { ISD::SETCC,   MVT::v32i8,   {   4,   2,   5,   6 } },
3575
3576     { ISD::SELECT,  MVT::v4f64,   {   3,   3,   1,   2 } }, // vblendvpd
3577     { ISD::SELECT,  MVT::v8f32,   {   3,   3,   1,   2 } }, // vblendvps
3578     { ISD::SELECT,  MVT::v4i64,   {   3,   3,   1,   2 } }, // vblendvpd
3579     { ISD::SELECT,  MVT::v8i32,   {   3,   3,   1,   2 } }, // vblendvps
3580     { ISD::SELECT,  MVT::v16i16,  {   3,   3,   3,   3 } }, // vandps + vandnps + vorps
3581     { ISD::SELECT,  MVT::v32i8,   {   3,   3,   3,   3 } }, // vandps + vandnps + vorps
3582   };
3583
3584   static const CostKindTblEntry SSE42CostTbl[] = {
3585     { ISD::SETCC,   MVT::v2i64,   {   1,   2,   1,   2 } },
3586   };
3587
3588   static const CostKindTblEntry SSE41CostTbl[] = {
3589     { ISD::SETCC,   MVT::v2f64,   {   1,   5,   1,   1 } },
3590     { ISD::SETCC,   MVT::v4f32,   {   1,   5,   1,   1 } },
3591
3592     { ISD::SELECT,  MVT::v2f64,   {   2,   2,   1,   2 } }, // blendvpd
3593     { ISD::SELECT,  MVT::f64,     {   2,   2,   1,   2 } }, // blendvpd
3594     { ISD::SELECT,  MVT::v4f32,   {   2,   2,   1,   2 } }, // blendvps
3595     { ISD::SELECT,  MVT::f32  ,   {   2,   2,   1,   2 } }, // blendvps
3596     { ISD::SELECT,  MVT::v2i64,   {   2,   2,   1,   2 } }, // pblendvb
3597     { ISD::SELECT,  MVT::v4i32,   {   2,   2,   1,   2 } }, // pblendvb
3598     { ISD::SELECT,  MVT::v8i16,   {   2,   2,   1,   2 } }, // pblendvb
3599     { ISD::SELECT,  MVT::v16i8,   {   2,   2,   1,   2 } }, // pblendvb
3600   };
3601
3602   static const CostKindTblEntry SSE2CostTbl[] = {
3603     { ISD::SETCC,   MVT::v2f64,   {   2,   5,   1,   1 } },
3604     { ISD::SETCC,   MVT::f64,     {   1,   5,   1,   1 } },
3605
3606     { ISD::SETCC,   MVT::v2i64,   {   5,   4,   5,   5 } }, // pcmpeqd/pcmpgtd expansion
3607     { ISD::SETCC,   MVT::v4i32,   {   1,   1,   1,   1 } },
3608     { ISD::SETCC,   MVT::v8i16,   {   1,   1,   1,   1 } },
3609     { ISD::SETCC,   MVT::v16i8,   {   1,   1,   1,   1 } },
3610
3611     { ISD::SELECT,  MVT::v2f64,   {   2,   2,   3,   3 } }, // andpd + andnpd + orpd
3612     { ISD::SELECT,  MVT::f64,     {   2,   2,   3,   3 } }, // andpd + andnpd + orpd
3613     { ISD::SELECT,  MVT::v2i64,   {   2,   2,   3,   3 } }, // pand + pandn + por
3614     { ISD::SELECT,  MVT::v4i32,   {   2,   2,   3,   3 } }, // pand + pandn + por
3615     { ISD::SELECT,  MVT::v8i16,   {   2,   2,   3,   3 } }, // pand + pandn + por
3616     { ISD::SELECT,  MVT::v16i8,   {   2,   2,   3,   3 } }, // pand + pandn + por
3617   };
3618
3619   static const CostKindTblEntry SSE1CostTbl[] = {
3620     { ISD::SETCC,   MVT::v4f32,   {   2,   5,   1,   1 } },
3621     { ISD::SETCC,   MVT::f32,     {   1,   5,   1,   1 } },
3622
3623     { ISD::SELECT,  MVT::v4f32,   {   2,   2,   3,   3 } }, // andps + andnps + orps
3624     { ISD::SELECT,  MVT::f32,     {   2,   2,   3,   3 } }, // andps + andnps + orps
3625   };
3626
// Lookup cascade: tables are consulted from most to least specific feature
// set; the first matching entry wins. LT.first (the legalization split
// factor) multiplies the per-op cost plus any predicate ExtraCost.
3627   if (ST->useSLMArithCosts())
3628     if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3629       if (auto KindCost = Entry->Cost[CostKind])
3630         return LT.first * (ExtraCost + *KindCost);
3631
3632   if (ST->hasBWI())
3633     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3634       if (auto KindCost = Entry->Cost[CostKind])
3635         return LT.first * (ExtraCost + *KindCost);
3636
3637   if (ST->hasAVX512())
3638     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3639       if (auto KindCost = Entry->Cost[CostKind])
3640         return LT.first * (ExtraCost + *KindCost);
3641
3642   if (ST->hasAVX2())
3643     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3644       if (auto KindCost = Entry->Cost[CostKind])
3645         return LT.first * (ExtraCost + *KindCost);
3646
3647   if (ST->hasXOP())
3648     if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3649       if (auto KindCost = Entry->Cost[CostKind])
3650         return LT.first * (ExtraCost + *KindCost);
3651
3652   if (ST->hasAVX())
3653     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3654       if (auto KindCost = Entry->Cost[CostKind])
3655         return LT.first * (ExtraCost + *KindCost);
3656
3657   if (ST->hasSSE42())
3658     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3659       if (auto KindCost = Entry->Cost[CostKind])
3660         return LT.first * (ExtraCost + *KindCost);
3661
3662   if (ST->hasSSE41())
3663     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3664       if (auto KindCost = Entry->Cost[CostKind])
3665         return LT.first * (ExtraCost + *KindCost);
3666
3667   if (ST->hasSSE2())
3668     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3669       if (auto KindCost = Entry->Cost[CostKind])
3670         return LT.first * (ExtraCost + *KindCost);
3671
3672   if (ST->hasSSE1())
3673     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3674       if (auto KindCost = Entry->Cost[CostKind])
3675         return LT.first * (ExtraCost + *KindCost);
3676
3677   // Assume a 3cy latency for fp select ops.
3678   if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3679     if (ValTy->getScalarType()->isFloatingPointTy())
3680       return 3;
3681
// No table entry matched: defer to the target-independent implementation.
3682   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3683                                    Op1Info, Op2Info, I);
3684}
3685
3687
3691 // Costs should match the codegen from:
3692 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3693 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3694 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3695 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3696 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3697
3698 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3699 // specialized in these tables yet.
3700 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3701 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3702 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3703 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3704 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3705 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3706 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3707 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3708 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3709 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3710 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3711 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3712 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3713 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3714 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3715 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3716 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3717 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3718 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3719 };
3720 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3721 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3722 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3723 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3724 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3725 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3726 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3727 };
3728 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3729 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3730 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3731 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3732 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3733 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3734 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3735 };
3736 static const CostKindTblEntry AVX512CDCostTbl[] = {
3737 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3738 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3739 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3740 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3741 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3742 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3743 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3744 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3745 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3746 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3747 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3748 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3749
3750 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3751 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3752 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3753 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3754 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3755 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3756 };
3757 static const CostKindTblEntry AVX512BWCostTbl[] = {
3758 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3759 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3760 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3761 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3762 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3763 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3764 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3765 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3766 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3767 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3768 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3769 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3770 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3771 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3772 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3773 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3774 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3775 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3776 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3777 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3778 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3779 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3780 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3781 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3782 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3783 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3784 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3785 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3786 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3787 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3788 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3789 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3790 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3791 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3792 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3793 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3794 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3795 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3796 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3797 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3798 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3799 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3800 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3801 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3802 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3803 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3804 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3805 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3806 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3807 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3808 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3809 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3810 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3811 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3812 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3813 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3814 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3815 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3816 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3817 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3818 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3819 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3820 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3821 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3822 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3823 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3824 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3825 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3826 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3827 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3828 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3829 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3830 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3831 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3832 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3833 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3834 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3835 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3836 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3837 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3838 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3839 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3840 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3841 };
3842 static const CostKindTblEntry AVX512CostTbl[] = {
3843 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3844 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3845 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3846 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3847 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3848 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3849 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3850 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3851 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3852 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3853 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3854 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3855 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3856 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3857 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3858 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3859 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3860 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3861 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3862 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3863 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3864 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3865 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3866 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3867 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3868 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3869 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3870 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3871 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3872 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3873 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3874 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3875 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3876 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3877 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3878 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3879 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3880 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3881 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3882 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3883 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3884 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3885 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3886 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3887 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3888 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3889 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3890 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3891 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3892 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3893 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3894 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3895 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3896 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3897 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3898 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3899 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3900 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3901 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3902 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3903 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3904 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3905 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3906 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3907 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3908 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3909 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3910 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3911 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3912 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3913 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3914 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3915 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3916 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3917 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3918 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3919 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3920 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3921 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3922 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3923 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3924 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3925 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3926 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3927 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3928 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3929 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3930 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3931 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3932 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3933 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3934 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3935 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3936 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3937 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3938 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3939 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3940 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3941 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3942 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3943 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3944 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3945 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3946 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3947 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3948 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3949 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3950 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3951 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3952 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3953 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3954 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3955 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3956 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3957 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3958 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3959 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3960 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3961 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3962 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3963 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3964 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3965 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3966 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3967 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3968 };
// Costs for targets with AMD XOP. Each entry is a per-cost-kind tuple
// { recip-throughput, latency, code-size, size+latency } — see the file
// header for how the four TargetCostKind values are interpreted.
3969 static const CostKindTblEntry XOPCostTbl[] = {
3970 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3971 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3972 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3973 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3974 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3975 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3976 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3977 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3978 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3979 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3980 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3981 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3982 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3983 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3984 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3985 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3986 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3987 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3988 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3989 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3990 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3991 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3992 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3993 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3994 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3995 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3996 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3997 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3998 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3999 // Rotate-by-immediate (256-bit types cost more: 2 x 128-bit op + extract/insert).
3999 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
4000 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
4001 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
4002 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
4003 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
4004 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
4005 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
4006 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
4007 };
// Costs for targets with AVX2 (256-bit integer ops available natively).
// Each entry is { recip-throughput, latency, code-size, size+latency };
// see the file header for how the four TargetCostKind values are interpreted.
4008 static const CostKindTblEntry AVX2CostTbl[] = {
4009 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4010 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4011 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
4012 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
4013 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
4014 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
4015 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
4016 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
4017 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
4018 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
4019 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
4020 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
4021 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
4022 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
4023 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
4024 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
4025 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
4026 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
4027 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
4028 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
4029 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
4030 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
4031 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
4032 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
4033 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
4034 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
4035 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
4036 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
4037 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
4038 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
4039 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
4040 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
4041 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
4042 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
4043 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
4044 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
4045 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
4046 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
4047 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
4048 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
4049 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
4050 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
4051 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
4052 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
4053 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
4054 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
4055 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4056 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4057 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4058 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4059 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4060 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4061 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4062 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4063 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4064 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4065 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4066 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4067 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4068 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4069 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4070 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4071 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4072 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4073 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4074 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4075 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4076 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4077 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4078 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4079 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4080 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4081 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4082 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4083 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4084 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4085 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4086 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4087 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4088 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4089 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4090 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4091 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4092 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4093 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4094 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4095 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4096 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4097 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4098 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4099 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4100 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4101 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4102 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4103 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4104 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4105 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4106 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4107 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4108 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4109 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4110 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4111 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4112 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4113 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4114 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4115 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4116 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4117 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4118 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4119 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4120 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4121 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4122 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4123 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4124 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4125 };
// Costs for targets with AVX1 only (no AVX2): most 256-bit integer ops are
// emulated as 2 x 128-bit ops plus extract/insert, hence the higher costs.
// Each entry is { recip-throughput, latency, code-size, size+latency }; see
// the file header for how the four TargetCostKind values are interpreted.
// NOTE: table lookup is first-match, so entries must not repeat a
// (opcode, type) pair — a previously duplicated USUBSAT v8i32 entry
// ({ 3, 3, 5, 6 }) was unreachable and has been removed.
4126 static const CostKindTblEntry AVX1CostTbl[] = {
4127 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4128 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4129 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4130 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4131 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4132 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4133 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4134 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4135 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4136 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4137 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4138 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4139 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4140 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4141 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4142 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4143 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4144 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4145 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4146 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4147 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4148 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4149 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4150 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4151 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4152 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4153 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4154 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4155 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4156 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4157 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4158 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4159 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4160 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4161 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4162 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4163 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4164 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4165 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4166 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4167 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4168 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4169 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4170 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4171 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4172 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4173 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4174 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4175 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4176 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4177 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4178 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4179 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4180 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4181 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4182 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4183 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4184 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4185 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4186 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4187 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4188 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4189 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4190 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4191 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4192 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4193 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4194 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4195 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4196 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4197 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4198 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4199 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4200 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4201 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4202 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4203 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4204 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4205 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4206 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4207 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4208 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4209 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4210 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4211 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4212 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4213 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4214 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4215 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4216 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4217 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4218 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4219 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4220 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4221 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4222 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4224 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4225 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4226 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4227 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4228 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4229 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4230 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4231 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4232 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4233 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4234 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4235 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4236 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4237 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4238 };
// Costs for targets with GFNI: bit-reverse and byte rotates lower to a
// single GF2P8AFFINEQB. Each entry is { recip-throughput, latency,
// code-size, size+latency } (see file header).
4239 static const CostKindTblEntry GFNICostTbl[] = {
4240 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4241 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4242 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4243 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4244 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4245 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4246 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4247 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4248 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4249 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4250 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4251 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4252 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4253 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4254 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4255 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4256 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4257 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4258 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4259 };
// CPU-specific overrides for Intel Goldmont: sqrt is notably slower than on
// big cores. Each entry is { recip-throughput, latency, code-size,
// size+latency } (see file header).
4260 static const CostKindTblEntry GLMCostTbl[] = {
4261 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4262 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4263 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4264 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4265 };
// CPU-specific overrides for Intel Silvermont. Each entry is
// { recip-throughput, latency, code-size, size+latency } (see file header).
4266 static const CostKindTblEntry SLMCostTbl[] = {
4267 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4268 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4269 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4270 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4271 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4272 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4273 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4274 };
// Costs for SSE4.2-level targets (Nehalem-class baseline per file header).
// Each entry is { recip-throughput, latency, code-size, size+latency }.
4275 static const CostKindTblEntry SSE42CostTbl[] = {
4276 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4277 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4278 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4279 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4280 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4281 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4282 };
// Costs for SSE4.1-level targets (Penryn-class baseline per file header).
// Each entry is { recip-throughput, latency, code-size, size+latency }.
4283 static const CostKindTblEntry SSE41CostTbl[] = {
4284 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4285 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4286 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4287 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4288 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4289 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4290 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4291 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4292 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4293 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4294 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4295 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4296 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4297 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4298 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4299 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4300 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4301 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4302 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4303 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4304 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4305 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4306 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4307 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4308 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4309 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4310 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4311 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4312 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4313 };
// Costs for SSSE3-level targets (PSHUFB enables the LUT-based bit tricks
// used by bitreverse/ctlz/ctpop/cttz lowerings). Each entry is
// { recip-throughput, latency, code-size, size+latency } (see file header).
4314 static const CostKindTblEntry SSSE3CostTbl[] = {
4315 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4316 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4317 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4318 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4319 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4320 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4321 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4322 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4323 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4324 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4325 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4326 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4327 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4328 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4329 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4330 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4331 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4332 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4333 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4334 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4335 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4336 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4337 };
// Costs for the SSE2 baseline (every x86-64 target has at least this).
// Each entry is { recip-throughput, latency, code-size, size+latency };
// see the file header for how the four TargetCostKind values are interpreted.
4338 static const CostKindTblEntry SSE2CostTbl[] = {
4339 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4340 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4341 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4342 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4343 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4344 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4345 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4346 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4347 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4348 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4349 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4350 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4351 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4352 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4353 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4354 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4355 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4356 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4357 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4358 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4359 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4360 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4361 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4362 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4363 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4364 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4365 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4366 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4367 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4368 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4369 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4370 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4371 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4372 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4373 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4374 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4375 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4376 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4377 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4378 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4379 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4380 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4381 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4382 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4383 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4384 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4385 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4386 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4387 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4388 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4389 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4390 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4391 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4392 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4393 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4394 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4395 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4396 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4397 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4398 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4399 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4400 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4401 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4402 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4403 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4404 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4405 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4406 };
// Costs for SSE1-only targets (Pentium III-class baseline per file header).
// Each entry is { recip-throughput, latency, code-size, size+latency }.
4407 static const CostKindTblEntry SSE1CostTbl[] = {
4408 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4409 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4410 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4411 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4412 };
// BMI TZCNT on 64-bit-only types; { recip-throughput, latency, code-size, size+latency }.
4413 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4414 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4415 };
// BMI TZCNT on scalar types available on both 32- and 64-bit targets;
// { recip-throughput, latency, code-size, size+latency }.
4416 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4417 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4418 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4419 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4420 };
// LZCNT on i64 (64-bit targets only); { recip-throughput, latency, code-size, size+latency }.
4421 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4422 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4423 };
// LZCNT on scalar types available on both 32- and 64-bit targets;
// { recip-throughput, latency, code-size, size+latency }.
4424 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4425 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4426 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4427 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4428 };
// Hardware POPCNT on i64 (64-bit targets only);
// { recip-throughput, latency, code-size, size+latency }.
4429 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4430 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4431 };
// Hardware POPCNT on scalar types available on both 32- and 64-bit targets;
// sub-i32 types pay for a zero-extension first.
// { recip-throughput, latency, code-size, size+latency }.
4432 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4433 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4434 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4435 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4436 };
// Scalar i64 operations that only exist on 64-bit targets.
// Each entry is { recip-throughput, latency, code-size, size+latency };
// see the file header for how the four TargetCostKind values are interpreted.
4437 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4438 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4439 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4440 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4441 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4442 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4443 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4444 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4445 { ISD::CTLZ_ZERO_POISON,MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4446 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4447 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4448 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4449 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4450 { ISD::CTTZ_ZERO_POISON,MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4451 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4452 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4453 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4454 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4455 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4456 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4457 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4458 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4459 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4460 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4461 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4462 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4463 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4464 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4465 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4466 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4467 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4468 };
// Baseline scalar costs valid on any x86 target (32- or 64-bit).
// Each entry is { recip-throughput, latency, code-size, size+latency };
// see the file header for how the four TargetCostKind values are interpreted.
4469 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4470 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4471 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4472 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4473 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4474 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4475 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4476 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4477 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4478 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4479 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4480 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4481 { ISD::CTLZ_ZERO_POISON,MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4482 { ISD::CTLZ_ZERO_POISON,MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4483 { ISD::CTLZ_ZERO_POISON,MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4484 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4485 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4486 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4487 { ISD::CTTZ_ZERO_POISON,MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4488 { ISD::CTTZ_ZERO_POISON,MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4489 { ISD::CTTZ_ZERO_POISON,MVT::i8, { 2, 2, 1, 2 } }, // BSF
4490 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4491 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4492 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4493 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4494 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4495 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4496 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4497 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4498 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4499 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4500 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4501 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4502 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4503 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4504 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4505 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4506 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4507 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4508 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4509 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4510 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4511 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4512 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4513 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4514 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4515 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4516 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4517 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4518 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4519 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4520 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4521 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4522 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4523 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4524 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4525 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4526 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4527 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4528 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4529 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4530 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4531 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4532 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4533 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4534 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4535 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4536 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4537 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4538 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4539 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4540 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4541 };
4542
4543 Type *RetTy = ICA.getReturnType();
4544 Type *OpTy = RetTy;
4545 Intrinsic::ID IID = ICA.getID();
4546 unsigned ISD = ISD::DELETED_NODE;
4547 switch (IID) {
4548 default:
4549 break;
4550 case Intrinsic::abs:
4551 ISD = ISD::ABS;
4552 break;
4553 case Intrinsic::bitreverse:
4555 break;
4556 case Intrinsic::bswap:
4557 ISD = ISD::BSWAP;
4558 break;
4559 case Intrinsic::ctlz:
4560 ISD = ISD::CTLZ;
4561 break;
4562 case Intrinsic::ctpop:
4563 ISD = ISD::CTPOP;
4564 break;
4565 case Intrinsic::cttz:
4566 ISD = ISD::CTTZ;
4567 break;
4568 case Intrinsic::fshl:
4569 ISD = ISD::FSHL;
4570 if (!ICA.isTypeBasedOnly()) {
4571 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4572 if (Args[0] == Args[1]) {
4573 ISD = ISD::ROTL;
4574 // Handle uniform constant rotation amounts.
4575 // TODO: Handle funnel-shift cases.
4576 const APInt *Amt;
4577 if (Args[2] &&
4579 ISD = X86ISD::VROTLI;
4580 }
4581 }
4582 break;
4583 case Intrinsic::fshr:
4584 // FSHR has same costs so don't duplicate.
4585 ISD = ISD::FSHL;
4586 if (!ICA.isTypeBasedOnly()) {
4587 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4588 if (Args[0] == Args[1]) {
4589 ISD = ISD::ROTR;
4590 // Handle uniform constant rotation amount.
4591 // TODO: Handle funnel-shift cases.
4592 const APInt *Amt;
4593 if (Args[2] &&
4595 ISD = X86ISD::VROTLI;
4596 }
4597 }
4598 break;
4599 case Intrinsic::lrint:
4600 case Intrinsic::llrint: {
4601 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4602 // have the same costs as the CVTTP2SI (fptosi) instructions
4603 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4604 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4606 }
4607 case Intrinsic::maxnum:
4608 case Intrinsic::minnum:
4609 // FMINNUM has same costs so don't duplicate.
4610 ISD = ISD::FMAXNUM;
4611 break;
4612 case Intrinsic::sadd_sat:
4613 ISD = ISD::SADDSAT;
4614 break;
4615 case Intrinsic::smax:
4616 ISD = ISD::SMAX;
4617 break;
4618 case Intrinsic::smin:
4619 ISD = ISD::SMIN;
4620 break;
4621 case Intrinsic::ssub_sat:
4622 ISD = ISD::SSUBSAT;
4623 break;
4624 case Intrinsic::uadd_sat:
4625 ISD = ISD::UADDSAT;
4626 break;
4627 case Intrinsic::umax:
4628 ISD = ISD::UMAX;
4629 break;
4630 case Intrinsic::umin:
4631 ISD = ISD::UMIN;
4632 break;
4633 case Intrinsic::usub_sat:
4634 ISD = ISD::USUBSAT;
4635 break;
4636 case Intrinsic::sqrt:
4637 ISD = ISD::FSQRT;
4638 break;
4639 case Intrinsic::sadd_with_overflow:
4640 case Intrinsic::ssub_with_overflow:
4641 // SSUBO has same costs so don't duplicate.
4642 ISD = ISD::SADDO;
4643 OpTy = RetTy->getContainedType(0);
4644 break;
4645 case Intrinsic::uadd_with_overflow:
4646 case Intrinsic::usub_with_overflow:
4647 // USUBO has same costs so don't duplicate.
4648 ISD = ISD::UADDO;
4649 OpTy = RetTy->getContainedType(0);
4650 break;
4651 case Intrinsic::smul_with_overflow:
4652 ISD = ISD::SMULO;
4653 OpTy = RetTy->getContainedType(0);
4654 break;
4655 case Intrinsic::umul_with_overflow:
4656 ISD = ISD::UMULO;
4657 OpTy = RetTy->getContainedType(0);
4658 break;
4659 }
4660
4661 if (ISD != ISD::DELETED_NODE) {
4662 auto adjustTableCost = [&](int ISD, unsigned Cost,
4663 std::pair<InstructionCost, MVT> LT,
4665 InstructionCost LegalizationCost = LT.first;
4666 MVT MTy = LT.second;
4667
4668 // If there are no NANs to deal with, then these are reduced to a
4669 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4670 // assume is used in the non-fast case.
4671 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4672 if (FMF.noNaNs())
4673 return LegalizationCost * 1;
4674 }
4675
4676 // For cases where some ops can be folded into a load/store, assume free.
4677 if (MTy.isScalarInteger()) {
4678 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4679 if (const Instruction *II = ICA.getInst()) {
4680 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4681 return TTI::TCC_Free;
4682 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4683 if (LI->hasOneUse())
4684 return TTI::TCC_Free;
4685 }
4686 }
4687 }
4688 }
4689
4690 return LegalizationCost * (int)Cost;
4691 };
4692
4693 // Legalize the type.
4694 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4695 MVT MTy = LT.second;
4696
4697 // Without BMI/LZCNT see if we're only looking for a *_ZERO_POISON cost.
4698 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4699 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4700 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4701 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4702 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4703 if (Cst->isAllOnesValue())
4704 ISD =
4706 }
4707
4708 // FSQRT is a single instruction.
4710 return LT.first;
4711
4712 if (ST->useGLMDivSqrtCosts())
4713 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4714 if (auto KindCost = Entry->Cost[CostKind])
4715 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4716
4717 if (ST->useSLMArithCosts())
4718 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4719 if (auto KindCost = Entry->Cost[CostKind])
4720 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4721
4722 if (ST->hasVBMI2())
4723 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4724 if (auto KindCost = Entry->Cost[CostKind])
4725 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4726
4727 if (ST->hasBITALG())
4728 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4729 if (auto KindCost = Entry->Cost[CostKind])
4730 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4731
4732 if (ST->hasVPOPCNTDQ())
4733 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4734 if (auto KindCost = Entry->Cost[CostKind])
4735 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4736
4737 if (ST->hasGFNI())
4738 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4739 if (auto KindCost = Entry->Cost[CostKind])
4740 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4741
4742 if (ST->hasCDI())
4743 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4744 if (auto KindCost = Entry->Cost[CostKind])
4745 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4746
4747 if (ST->hasBWI())
4748 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4749 if (auto KindCost = Entry->Cost[CostKind])
4750 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4751
4752 if (ST->hasAVX512())
4753 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4754 if (auto KindCost = Entry->Cost[CostKind])
4755 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4756
4757 if (ST->hasXOP())
4758 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4759 if (auto KindCost = Entry->Cost[CostKind])
4760 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4761
4762 if (ST->hasAVX2())
4763 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4764 if (auto KindCost = Entry->Cost[CostKind])
4765 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4766
4767 if (ST->hasAVX())
4768 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4769 if (auto KindCost = Entry->Cost[CostKind])
4770 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4771
4772 if (ST->hasSSE42())
4773 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4774 if (auto KindCost = Entry->Cost[CostKind])
4775 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4776
4777 if (ST->hasSSE41())
4778 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4779 if (auto KindCost = Entry->Cost[CostKind])
4780 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4781
4782 if (ST->hasSSSE3())
4783 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4784 if (auto KindCost = Entry->Cost[CostKind])
4785 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4786
4787 if (ST->hasSSE2())
4788 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4789 if (auto KindCost = Entry->Cost[CostKind])
4790 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4791
4792 if (ST->hasSSE1())
4793 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4794 if (auto KindCost = Entry->Cost[CostKind])
4795 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4796
4797 if (ST->hasBMI()) {
4798 if (ST->is64Bit())
4799 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4800 if (auto KindCost = Entry->Cost[CostKind])
4801 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4802
4803 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4804 if (auto KindCost = Entry->Cost[CostKind])
4805 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4806 }
4807
4808 if (ST->hasLZCNT()) {
4809 if (ST->is64Bit())
4810 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4811 if (auto KindCost = Entry->Cost[CostKind])
4812 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4813
4814 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4815 if (auto KindCost = Entry->Cost[CostKind])
4816 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4817 }
4818
4819 if (ST->hasPOPCNT()) {
4820 if (ST->is64Bit())
4821 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4822 if (auto KindCost = Entry->Cost[CostKind])
4823 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4824
4825 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4826 if (auto KindCost = Entry->Cost[CostKind])
4827 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4828 }
4829
4830 if (ST->is64Bit())
4831 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4832 if (auto KindCost = Entry->Cost[CostKind])
4833 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4834
4835 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4836 if (auto KindCost = Entry->Cost[CostKind])
4837 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4838
4839 // Without arg data, we need to compute the expanded costs of custom lowered
4840 // intrinsics to prevent use of the (very low) default costs.
4841 if (ICA.isTypeBasedOnly() &&
4842 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4843 Type *CondTy = RetTy->getWithNewBitWidth(1);
4845 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4846 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4847 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4848 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4849 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4850 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4852 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4854 return Cost;
4855 }
4856 }
4857
4859}
4860
// NOTE(review): the first line of this definition (return type + qualified
// name, orig. line 4861) is missing from this extract; from the fallback call
// below this is X86TTIImpl::getVectorInstrCost — the per-element vector
// insert/extract cost hook. It models constant-index insert/extract directly
// and defers everything else to BaseT::getVectorInstrCost. Confirm against
// the full file.
4862 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4863 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
// Silvermont-specific override costs for element extraction, keyed by the
// legalized scalar type; consulted below only when useSLMArithCosts() is set.
4864 static const CostTblEntry SLMCostTbl[] = {
4865 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4866 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4867 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4868 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4869 };
4870
4871 assert(Val->isVectorTy() && "This must be a vector type");
4872 auto *VT = cast<VectorType>(Val);
// X86 has no scalable vectors.
// NOTE(review): the statement guarded by this 'if' (orig. line 4874) is
// missing from this extract — presumably an invalid-cost early return;
// confirm against the full file.
4873 if (VT->isScalableTy())
4875
4876 Type *ScalarType = Val->getScalarType();
// Accumulates extra cross-128-bit-subvector move costs that are added on top
// of whichever base cost is ultimately returned.
4877 InstructionCost RegisterFileMoveCost = 0;
4878
4879 // Non-immediate extraction/insertion can be handled as a sequence of
4880 // aliased loads+stores via the stack.
4881 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4882 Opcode == Instruction::InsertElement)) {
4883 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4884 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4885
4886 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4887 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4888 Align VecAlign = DL.getPrefTypeAlign(Val);
4889 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4890
4891 // Extract - store vector to stack, load scalar.
4892 if (Opcode == Instruction::ExtractElement) {
4893 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4894 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4895 CostKind);
4896 }
4897 // Insert - store vector to stack, store scalar, load vector.
4898 if (Opcode == Instruction::InsertElement) {
4899 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4900 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4901 CostKind) +
4902 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4903 }
4904 }
4905
// Known (constant) index insert/extract.
4906 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4907 Opcode == Instruction::InsertElement)) {
4908 // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
4909 if (Opcode == Instruction::ExtractElement &&
4910 ScalarType->getScalarSizeInBits() == 1 &&
4911 cast<FixedVectorType>(Val)->getNumElements() > 1)
4912 return 1;
4913
4914 // Legalize the type.
4915 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4916
4917 // This type is legalized to a scalar type.
4918 if (!LT.second.isVector())
4919 return TTI::TCC_Free;
4920
4921 // The type may be split. Normalize the index to the new type.
4922 unsigned SizeInBits = LT.second.getSizeInBits();
4923 unsigned NumElts = LT.second.getVectorNumElements();
4924 unsigned SubNumElts = NumElts;
4925 Index = Index % NumElts;
4926
4927 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4928 // For inserts, we also need to insert the subvector back.
4929 if (SizeInBits > 128) {
4930 assert((SizeInBits % 128) == 0 && "Illegal vector");
4931 unsigned NumSubVecs = SizeInBits / 128;
4932 SubNumElts = NumElts / NumSubVecs;
4933 if (SubNumElts <= Index) {
// Insert pays for both the subvector extract and the re-insert (cost 2);
// extract only pays for the subvector extract (cost 1).
4934 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4935 Index %= SubNumElts;
4936 }
4937 }
4938
4939 MVT MScalarTy = LT.second.getScalarType();
// True when a single pinsr/pextr/insertps/movss can do the element move on
// the current subtarget (see the feature checks below).
4940 auto IsCheapPInsrPExtrInsertPS = [&]() {
4941 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4942 // Inserting f32 into index0 is just movss.
4943 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4944 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4945 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4946 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4947 Opcode == Instruction::InsertElement) ||
4948 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4949 Opcode == Instruction::InsertElement);
4950 };
4951
4952 if (Index == 0) {
4953 // Floating point scalars are already located in index #0.
4954 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4955 // true for all.
4956 if (ScalarType->isFloatingPointTy() &&
4957 (Opcode != Instruction::InsertElement || !Op0 ||
4958 isa<UndefValue>(Op0)))
4959 return RegisterFileMoveCost;
4960
// NOTE(review): orig. lines 4962 and 4964 (the remainder of this condition
// and part of its body) are missing from this extract, so the exact guard
// cannot be verified here — confirm against the full file.
4961 if (Opcode == Instruction::InsertElement &&
4963 // Consider the gather cost to be cheap.
4965 return RegisterFileMoveCost;
4966 if (!IsCheapPInsrPExtrInsertPS()) {
4967 // mov constant-to-GPR + movd/movq GPR -> XMM.
4968 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4969 return 2 + RegisterFileMoveCost;
4970 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4971 return 1 + RegisterFileMoveCost;
4972 }
4973 }
4974
4975 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4976 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4977 return 1 + RegisterFileMoveCost;
4978 }
4979
4980 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4981 assert(ISD && "Unexpected vector opcode");
// Silvermont extracts are unusually expensive — use the override table.
4982 if (ST->useSLMArithCosts())
4983 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4984 return Entry->Cost + RegisterFileMoveCost;
4985
4986 // Consider cheap cases.
4987 if (IsCheapPInsrPExtrInsertPS())
4988 return 1 + RegisterFileMoveCost;
4989
4990 // For extractions we just need to shuffle the element to index 0, which
4991 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4992 // the elements to its destination. In both cases we must handle the
4993 // subvector move(s).
4994 // If the vector type is already less than 128-bits then don't reduce it.
4995 // TODO: Under what circumstances should we shuffle using the full width?
4996 InstructionCost ShuffleCost = 1;
4997 if (Opcode == Instruction::InsertElement) {
4998 auto *SubTy = cast<VectorType>(Val);
4999 EVT VT = TLI->getValueType(DL, Val);
5000 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
5001 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
5002 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
5003 CostKind, 0, SubTy);
5004 }
// Integer elements pay an extra XMM<->GPR move on top of the shuffle.
5005 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
5006 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
5007 }
5008
// Everything else: defer to the generic implementation, plus any subvector
// move cost accumulated above.
5009 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
5010 VIC) +
5011 RegisterFileMoveCost;
5012}
5013
// NOTE(review): the first line of this definition (return type + qualified
// name, orig. line 5014) is missing from this extract; from the BaseT
// fallbacks below this is X86TTIImpl::getScalarizationOverhead — the cost of
// inserting the demanded elements into / extracting them out of a vector.
// Confirm against the full file.
5015 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
5016 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
5017 TTI::VectorInstrContext VIC) const {
5018 assert(DemandedElts.getBitWidth() ==
5019 cast<FixedVectorType>(Ty)->getNumElements() &&
5020 "Vector size mismatch");
5021
5022 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5023 MVT MScalarTy = LT.second.getScalarType();
5024 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
// NOTE(review): orig. line 5025 is missing from this extract — presumably
// the declaration/initialization of the 'Cost' accumulator used throughout
// the rest of this function; confirm against the full file.
5026
// All the per-lane logic below works in units of 128-bit lanes.
5027 constexpr unsigned LaneBitWidth = 128;
5028 assert((LegalVectorBitWidth < LaneBitWidth ||
5029 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
5030 "Illegal vector");
5031
// LT.first is the number of legal vectors the type was split into.
5032 const int NumLegalVectors = LT.first.getValue();
5033 assert(NumLegalVectors >= 0 && "Negative cost!");
5034
5035 // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
5036 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
5037 // a special heuristic regarding poison input which is passed here in
5038 // ForPoisonSrc.
5039 if (Insert && !ForPoisonSrc) {
5040 // This is nearly identical to BaseT::getScalarizationOverhead(), except
5041 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
5042 // Constant::getNullValue()), which makes the X86TTIImpl
5043 // getVectorInstrCost() return 0 instead of 1.
5044 for (unsigned I : seq(DemandedElts.getBitWidth())) {
5045 if (!DemandedElts[I])
5046 continue;
// NOTE(review): orig. lines 5048 and 5050 (further arguments to this
// getVectorInstrCost call) are missing from this extract — confirm against
// the full file.
5047 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
5049 VL.empty() ? nullptr : VL[I],
5051 }
5052 return Cost;
5053 }
5054
5055 if (Insert) {
// Same "directly insertable scalar type" feature checks as in
// getVectorInstrCost()'s IsCheapPInsrPExtrInsertPS lambda.
5056 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
5057 (MScalarTy.isInteger() && ST->hasSSE41()) ||
5058 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
5059 // For types we can insert directly, insertion into 128-bit sub vectors is
5060 // cheap, followed by a cheap chain of concatenations.
5061 if (LegalVectorBitWidth <= LaneBitWidth) {
5062 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5063 /*Extract*/ false, CostKind);
5064 } else {
5065 // In each 128-lane, if at least one index is demanded but not all
5066 // indices are demanded and this 128-lane is not the first 128-lane of
5067 // the legalized-vector, then this 128-lane needs a extracti128; If in
5068 // each 128-lane, there is at least one demanded index, this 128-lane
5069 // needs a inserti128.
5070
5071 // The following cases will help you build a better understanding:
5072 // Assume we insert several elements into a v8i32 vector in avx2,
5073 // Case#1: inserting into 1th index needs vpinsrd + inserti128.
5074 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
5075 // inserti128.
5076 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
5077 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5078 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5079 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5080 unsigned NumLegalElts =
5081 LT.second.getVectorNumElements() * NumLegalVectors;
5082 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5083 "Vector has been legalized to smaller element count");
5084 assert((NumLegalElts % NumLanesTotal) == 0 &&
5085 "Unexpected elts per lane");
5086 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5087
// Widen the demanded mask to cover legalization padding, then walk it one
// 128-bit lane at a time.
5088 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5089 auto *LaneTy =
5090 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5091
5092 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5093 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5094 NumEltsPerLane, NumEltsPerLane * I);
5095 if (LaneEltMask.isZero())
5096 continue;
5097 // FIXME: we don't need to extract if all non-demanded elements
5098 // are legalization-inserted padding.
// NOTE(review): orig. line 5100 (the start of the cost expression guarded
// by this 'if' — presumably a subvector-extract shuffle cost) is missing
// from this extract; confirm against the full file.
5099 if (!LaneEltMask.isAllOnes())
5101 CostKind, I * NumEltsPerLane, LaneTy);
5102 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5103 /*Extract*/ false, CostKind);
5104 }
5105
// Condense the per-element mask to per-lane, then to per-legal-vector
// (MatchAllBits: a legal vector counts only if ALL its lanes are affected).
5106 APInt AffectedLanes =
5107 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5108 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5109 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5110 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5111 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5112 unsigned I = NumLegalLanes * LegalVec + Lane;
5113 // No need to insert unaffected lane; or lane 0 of each legal vector
5114 // iff ALL lanes of that vector were affected and will be inserted.
5115 if (!AffectedLanes[I] ||
5116 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5117 continue;
// NOTE(review): orig. line 5118 (the start of the subvector-insert cost
// expression) is missing from this extract; confirm against the full file.
5119 CostKind, I * NumEltsPerLane, LaneTy);
5120 }
5121 }
5122 }
5123 } else if (LT.second.isVector()) {
5124 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5125 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5126 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5127 // considered cheap.
5128 if (Ty->isIntOrIntVectorTy())
5129 Cost += DemandedElts.popcount();
5130
5131 // Get the smaller of the legalized or original pow2-extended number of
5132 // vector elements, which represents the number of unpacks we'll end up
5133 // performing.
5134 unsigned NumElts = LT.second.getVectorNumElements();
// NOTE(review): orig. line 5136 (the expression initializing Pow2Elts —
// presumably a power-of-2 ceiling of the original element count) is missing
// from this extract; confirm against the full file.
5135 unsigned Pow2Elts =
5137 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5138 }
5139 }
5140
5141 if (Extract) {
5142 // vXi1 can be efficiently extracted with MOVMSK.
5143 // TODO: AVX512 predicate mask handling.
5144 // NOTE: This doesn't work well for roundtrip scalarization.
5145 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5146 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
// MOVMSK covers 32 elements per instruction with AVX2, 16 with SSE.
5147 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5148 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5149 return MOVMSKCost;
5150 }
5151
5152 if (LT.second.isVector()) {
5153 unsigned NumLegalElts =
5154 LT.second.getVectorNumElements() * NumLegalVectors;
5155 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5156 "Vector has been legalized to smaller element count");
5157
5158 // If we're extracting elements from a 128-bit subvector lane,
5159 // we only need to extract each lane once, not for every element.
5160 if (LegalVectorBitWidth > LaneBitWidth) {
5161 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5162 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5163 assert((NumLegalElts % NumLanesTotal) == 0 &&
5164 "Unexpected elts per lane");
5165 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5166
5167 // Add cost for each demanded 128-bit subvector extraction.
5168 // Luckily this is a lot easier than for insertion.
5169 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5170 auto *LaneTy =
5171 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5172
5173 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5174 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5175 NumEltsPerLane, I * NumEltsPerLane);
5176 if (LaneEltMask.isZero())
5177 continue;
// NOTE(review): orig. lines 5178 and 5180 (the starts of the per-lane
// subvector-extract cost and the recursive scalarization-overhead call)
// are missing from this extract; confirm against the full file.
5179 I * NumEltsPerLane, LaneTy);
5181 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5182 }
5183
5184 return Cost;
5185 }
5186 }
5187
5188 // Fallback to default extraction.
5189 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5190 Extract, CostKind);
5191 }
5192
5193 return Cost;
5194}
5195
// Cost of a replication shuffle: each of the VF source elements is repeated
// ReplicationFactor times to form VF*ReplicationFactor destination elements.
// Models the shuffle natively for AVX512 (promoting the element type where
// the subtarget lacks the needed byte/word shuffles) and bails out to the
// generic implementation otherwise.
// NOTE(review): the return-type line (orig. 5196) and the final parameter
// line (orig. 5199, presumably 'TTI::TargetCostKind CostKind) const {') are
// missing from this extract — confirm against the full file.
5197X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5198 int VF, const APInt &DemandedDstElts,
5200 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5201 // We don't differentiate element types here, only element bit width.
5202 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5203
// Shared fallback: defer to the target-independent cost model.
5204 auto bailout = [&]() {
5205 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5206 DemandedDstElts, CostKind);
5207 };
5208
5209 // For now, only deal with AVX512 cases.
5210 if (!ST->hasAVX512())
5211 return bailout();
5212
5213 // Do we have a native shuffle for this element type, or should we promote?
5214 unsigned PromEltTyBits = EltTyBits;
5215 switch (EltTyBits) {
5216 case 32:
5217 case 64:
5218 break; // AVX512F.
5219 case 16:
5220 if (!ST->hasBWI())
5221 PromEltTyBits = 32; // promote to i32, AVX512F.
5222 break; // AVX512BW
5223 case 8:
5224 if (!ST->hasVBMI())
5225 PromEltTyBits = 32; // promote to i32, AVX512F.
5226 break; // AVX512VBMI
5227 case 1:
5228 // There is no support for shuffling i1 elements. We *must* promote.
5229 if (ST->hasBWI()) {
5230 if (ST->hasVBMI())
5231 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5232 else
5233 PromEltTyBits = 16; // promote to i16, AVX512BW.
5234 break;
5235 }
5236 PromEltTyBits = 32; // promote to i32, AVX512F.
5237 break;
5238 default:
5239 return bailout();
5240 }
5241 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5242
// Source/destination vector types in both the original and promoted widths.
5243 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5244 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5245
5246 int NumDstElements = VF * ReplicationFactor;
5247 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5248 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5249
5250 // Legalize the types.
5251 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5252 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5253 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5254 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5255 // They should have legalized into vector types.
5256 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5257 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5258 return bailout();
5259
5260 if (PromEltTyBits != EltTyBits) {
5261 // If we have to perform the shuffle with wider elt type than our data type,
5262 // then we will first need to anyext (we don't care about the new bits)
5263 // the source elements, and then truncate Dst elements.
5264 InstructionCost PromotionCost;
// NOTE(review): orig. lines 5267 and 5271 (the trailing arguments of these
// two getCastInstrCost calls — presumably the cast-context hint and
// CostKind) are missing from this extract; confirm against the full file.
5265 PromotionCost += getCastInstrCost(
5266 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5268 PromotionCost +=
5269 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5270 /*Src=*/PromDstVecTy,
// Recurse with the promoted element type, which hits a non-promoting case.
5272 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5273 ReplicationFactor, VF,
5274 DemandedDstElts, CostKind);
5275 }
5276
5277 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5278 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5279 "We expect that the legalization doesn't affect the element width, "
5280 "doesn't coalesce/split elements.");
5281
5282 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5283 unsigned NumDstVectors =
5284 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5285
5286 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5287
5288 // Not all the produced Dst elements may be demanded. In our case,
5289 // given that a single Dst vector is formed by a single shuffle,
5290 // if all elements that will form a single Dst vector aren't demanded,
5291 // then we won't need to do that shuffle, so adjust the cost accordingly.
5292 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5293 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5294 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5295
// One SK_PermuteSingleSrc shuffle per demanded legal destination vector.
5296 InstructionCost SingleShuffleCost =
5297 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5298 /*Mask=*/{}, CostKind,
5299 /*Index=*/0, /*SubTp=*/nullptr);
5300 return NumDstVectorsDemanded * SingleShuffleCost;
5301}
5302
// X86 cost model for plain (unmasked) load/store instructions.
// NOTE(review): the extraction this chunk came from dropped several lines of
// this definition — its signature (originally line 5303,
// "InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src," —
// presumably; confirm upstream), the CostKind parameter line (5306), the
// non-throughput-cost-kind guard (5310), the `Cost` accumulator declaration
// (5334), and parts of the extract-subvector shuffle call (5441-5442).
// The code below is byte-identical to the extracted text; reconcile against
// the original X86TargetTransformInfo.cpp before editing.
5304 Align Alignment,
5305 unsigned AddressSpace,
5307 TTI::OperandValueInfo OpInfo,
5308 const Instruction *I) const {
// Cheap path for TCK_CodeSize/TCK_SizeAndLatency-style queries: everything is
// TCC_Basic except a store whose address GEP has a non-constant index (needs
// index+scale addressing, 2 uops).
5309 // TODO: Handle other cost kinds.
5311 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5312 // Store instruction with index and scale costs 2 Uops.
5313 // Check the preceding GEP to identify non-const indices.
5314 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5315 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5316 return TTI::TCC_Basic * 2;
5317 }
5318 }
5319 return TTI::TCC_Basic;
5320 }
5321
5322 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5323 "Invalid Opcode");
5324 // Type legalization can't handle structs
5325 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5326 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5327 CostKind, OpInfo, I);
5328
5329 // Legalize the type.
5330 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5331
5332 auto *VTy = dyn_cast<FixedVectorType>(Src);
5333
// NOTE(review): the declaration of the running `Cost` accumulator (original
// line 5334) was elided by the extraction; `Cost` is used below.
5335
5336 // Add a cost for constant load to vector.
5337 if (Opcode == Instruction::Store && OpInfo.isConstant())
5338 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5339 /*AddressSpace=*/0, CostKind, OpInfo)
5340
5341 // Handle the simple case of non-vectors.
5342 // NOTE: this assumes that legalization never creates vector from scalars!
5343 if (!VTy || !LT.second.isVector()) {
5344 // Each load/store unit costs 1.
5345 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5346 }
5347
5348 bool IsLoad = Opcode == Instruction::Load;
5349
5350 Type *EltTy = VTy->getElementType();
5351
5352 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5353
5354 // Source of truth: how many elements were there in the original IR vector?
5355 const unsigned SrcNumElt = VTy->getNumElements();
5356
5357 // How far have we gotten?
5358 int NumEltRemaining = SrcNumElt;
5359 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5360 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5361
5362 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5363
5364 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5365 const unsigned XMMBits = 128;
5366 if (XMMBits % EltTyBits != 0)
5367 // Vector size must be a multiple of the element size. I.e. no padding.
5368 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5369 CostKind, OpInfo, I);
5370 const int NumEltPerXMM = XMMBits / EltTyBits;
5371
5372 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5373
// Greedy splitting loop: start with the widest legal memory op and keep
// halving the op width until all source elements are covered.
5374 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5375 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5376 // How many elements would a single op deal with at once?
5377 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5378 // Vector size must be a multiple of the element size. I.e. no padding.
5379 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5380 CostKind, OpInfo, I);
5381 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5382
5383 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5384 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5385 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5386 "Unless we haven't halved the op size yet, "
5387 "we have less than two op's sized units of work left.")
5388
5389 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5390 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5391 : XMMVecTy;
5392
5393 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5394 "After halving sizes, the vector elt count is no longer a multiple "
5395 "of number of elements per operation?");
// Model sub-XMM ops as inserting/extracting one wide integer lane: view the
// vector as <N x iM> where M = EltTyBits * CurrNumEltPerOp.
5396 auto *CoalescedVecTy =
5397 CurrNumEltPerOp == 1
5398 ? CurrVecTy
5400 IntegerType::get(Src->getContext(),
5401 EltTyBits * CurrNumEltPerOp),
5402 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5403 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5404 DL.getTypeSizeInBits(CurrVecTy) &&
5405 "coalesciing elements doesn't change vector width.");
5406
5407 while (NumEltRemaining > 0) {
5408 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
5409
5410 // Can we use this vector size, as per the remaining element count?
5411 // Iff the vector is naturally aligned, we can do a wide load regardless.
5412 if (NumEltRemaining < CurrNumEltPerOp &&
5413 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5414 break; // Try smaller vector size.
5415
5416 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5417 // as a proxy for a double-pumped AVX memory interface such as on
5418 // Sandybridge.
5419 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5420 // will be scalarized.
5421 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5422 Cost += 2;
5423 else if (CurrOpSizeBytes < 4)
5424 Cost += 2;
5425 else
5426 Cost += 1;
5427
5428 // If we're loading a uniform value, then we don't need to split the load,
5429 // loading just a single (widest) vector can be reused by all splits.
5430 if (IsLoad && OpInfo.isUniform())
5431 return Cost;
5432
5433 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5434
5435 // If we have fully processed the previous reg, we need to replenish it.
5436 if (SubVecEltsLeft == 0) {
5437 SubVecEltsLeft += CurrVecTy->getNumElements();
5438 // And that's free only for the 0'th subvector of a legalized vector.
// NOTE(review): the getShuffleCost call here lost lines 5441-5442 (the
// shuffle-kind argument, presumably SK_ExtractSubvector/SK_InsertSubvector
// depending on IsLoad) to the extraction; only the tail survives.
5439 if (!Is0thSubVec)
5440 Cost +=
5443 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5444 }
5445
5446 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5447 // for smaller widths (32/16/8) we have to insert/extract them separately.
5448 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5449 // but let's pretend that it is also true for 16/8 bit wide ops...)
5450 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5451 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5452 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5453 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5454 APInt DemandedElts =
5455 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5456 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5457 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5458 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5459 !IsLoad, CostKind);
5460 }
5461
5462 SubVecEltsLeft -= CurrNumEltPerOp;
5463 NumEltRemaining -= CurrNumEltPerOp;
// Remaining chunks are offset by the bytes just consumed, so only the
// common alignment can be assumed from here on.
5464 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5465 }
5466 }
5467
5468 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5469
5470 return Cost;
5471}
5472
// Dispatch a masked-memory intrinsic cost query to the matching specialized
// hook: gather/scatter vs. masked load/store.
// NOTE(review): the function's signature (original lines 5473-5475) and the
// fall-through default return (line 5484, presumably delegating to the BaseT
// implementation) were dropped by the extraction — confirm upstream.
5476 switch (MICA.getID()) {
5477 case Intrinsic::masked_scatter:
5478 case Intrinsic::masked_gather:
5479 return getGatherScatterOpCost(MICA, CostKind);
5480 case Intrinsic::masked_load:
5481 case Intrinsic::masked_store:
5482 return getMaskedMemoryOpCost(MICA, CostKind);
5483 }
5485}
5486
// X86 cost model for masked load/store intrinsics. Scalar data falls back to
// the plain memory-op cost; illegal masked vectors are fully scalarized
// (per-element compare + branch + scalar memop); legal ones pay a small
// per-legalized-vector cost (cheaper with AVX-512).
// NOTE(review): the extraction dropped this function's signature (original
// lines 5487-5489) and several declarations below (5511 `MaskSplitCost`,
// 5515 the ICmp predicate/cost-kind arguments, 5518 `ValueSplitCost`,
// 5529 the running `Cost` accumulator) — code kept byte-identical; confirm
// against upstream before editing.
5490 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
5491 : Instruction::Store;
5492 Type *SrcTy = MICA.getDataType();
5493 Align Alignment = MICA.getAlignment();
5494 unsigned AddressSpace = MICA.getAddressSpace();
5495
5496 bool IsLoad = (Instruction::Load == Opcode);
5497 bool IsStore = (Instruction::Store == Opcode);
5498
5499 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5500 if (!SrcVTy)
5501 // To calculate scalar take the regular cost, without mask
5502 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5503
5504 unsigned NumElem = SrcVTy->getNumElements();
// Model the mask as one i8 per data element for scalarization purposes.
5505 auto *MaskTy =
5506 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5507 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5508 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5509 // Scalarization
5510 APInt DemandedElts = APInt::getAllOnes(NumElem);
5512 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5513 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5514 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5516 InstructionCost BranchCost = getCFInstrCost(Instruction::CondBr, CostKind);
5517 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5519 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5520 InstructionCost MemopCost =
5521 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5522 Alignment, AddressSpace, CostKind);
5523 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5524 }
5525
5526 // Legalize the type.
5527 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5528 auto VT = TLI->getValueType(DL, SrcVTy);
5530 MVT Ty = LT.second;
5531 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5532 // APX masked load/store for scalar is cheap.
5533 return Cost + LT.first;
5534
5535 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5536 LT.second.getVectorNumElements() == NumElem)
5537 // Promotion requires extend/truncate for data and a shuffle for mask.
5538 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5539 0, nullptr) +
5540 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5541 0, nullptr);
5542
5543 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5544 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5545 (unsigned)LT.first.getValue() *
5546 Ty.getVectorNumElements());
5547 // Expanding requires fill mask with zeroes
5548 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5549 CostKind, 0, MaskTy);
5550 }
5551
5552 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5553 if (!ST->hasAVX512())
5554 return Cost + LT.first * (IsLoad ? 2 : 8);
5555
5556 // AVX-512 masked load/store is cheaper
5557 return Cost + LT.first;
5558}
5559
// Cost of a chain of pointers sharing one base: with a known stride, X86
// addressing modes fold the differences into displacements, so only the base
// GEP is charged; otherwise defer to the generic implementation.
// NOTE(review): the signature's first line (original 5560) and the CostKind
// parameter line (5563) were dropped by the extraction.
5561 ArrayRef<const Value *> Ptrs, const Value *Base,
5562 const TTI::PointersChainInfo &Info, Type *AccessTy,
5564 if (Info.isSameBase() && Info.isKnownStride()) {
5565 // If all the pointers have known stride all the differences are translated
5566 // into constants. X86 memory addressing allows encoding it into
5567 // displacement. So we just need to take the base GEP cost.
5568 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5569 SmallVector<const Value *> Indices(BaseGEP->indices());
5570 return getGEPCost(BaseGEP->getSourceElementType(),
5571 BaseGEP->getPointerOperand(), Indices, nullptr,
5572 CostKind);
5573 }
// Base is not a GEP instruction: nothing to charge for the chain.
5574 return TTI::TCC_Free;
5575 }
5576 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5577}
5578
// Cost of computing an address: vector (non-consecutive) address computation
// pre-AVX2 is penalized heavily unless the access is strided; everything else
// defers to the generic model.
// NOTE(review): the signature's first line (original 5579-5580) and the
// CostKind parameter line (5582) were dropped by the extraction.
5581 const SCEV *Ptr,
5583 // Address computations in vectorized code with non-consecutive addresses will
5584 // likely result in more instructions compared to scalar code where the
5585 // computation can more often be merged into the index mode. The resulting
5586 // extra micro-ops can significantly decrease throughput.
5587 const unsigned NumVectorInstToHideOverhead = 10;
5588
5589 // Cost modeling of Strided Access Computation is hidden by the indexing
5590 // modes of X86 regardless of the stride value. We don't believe that there
5591 // is a difference between constant strided access in general and constant
5592 // strided value which is less than or equal to 64.
5593 // Even in the case of (loop invariant) stride whose value is not known at
5594 // compile time, the address computation will not incur more than one extra
5595 // ADD instruction.
5596 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5597 // TODO: AVX2 is the current cut-off because we don't have correct
5598 // interleaving costs for prior ISA's.
5599 if (!BaseT::isStridedAccess(Ptr))
5600 return NumVectorInstToHideOverhead;
5601 if (!BaseT::getConstantStrideStep(SE, Ptr))
5602 return 1;
5603 }
5604
5605 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5606}
5607
// X86 cost model for vector arithmetic reductions (add/fadd/mul/and/or/...):
// table lookups for known CPU-measured costs, special handling for i1
// allof/anyof/popcount reductions, and an explicit log2 shuffle+op ladder for
// power-of-two element counts.
// NOTE(review): the extraction dropped this function's signature (original
// lines 5608-5609), the early-out condition at 5611-5612 (presumably a
// min/max-recurrence guard), the cast-kind argument at 5683, the cast
// hint/ctpop-cost lines at 5764/5766, and the final CostKind argument of the
// closing getVectorInstrCost call (5867). Code kept byte-identical; confirm
// against upstream before editing.
5610 std::optional<FastMathFlags> FMF,
5613 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5614
5615 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5616 // and make it as the cost.
5617
5618 static const CostTblEntry SLMCostTbl[] = {
5619 { ISD::FADD, MVT::v2f64, 3 },
5620 { ISD::ADD, MVT::v2i64, 5 },
5621 };
5622
5623 static const CostTblEntry SSE2CostTbl[] = {
5624 { ISD::FADD, MVT::v2f64, 2 },
5625 { ISD::FADD, MVT::v2f32, 2 },
5626 { ISD::FADD, MVT::v4f32, 4 },
5627 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5628 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5629 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5630 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5631 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5632 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5633 { ISD::ADD, MVT::v2i8, 2 },
5634 { ISD::ADD, MVT::v4i8, 2 },
5635 { ISD::ADD, MVT::v8i8, 2 },
5636 { ISD::ADD, MVT::v16i8, 3 },
5637 };
5638
5639 static const CostTblEntry AVX1CostTbl[] = {
5640 { ISD::FADD, MVT::v4f64, 3 },
5641 { ISD::FADD, MVT::v4f32, 3 },
5642 { ISD::FADD, MVT::v8f32, 4 },
5643 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5644 { ISD::ADD, MVT::v4i64, 3 },
5645 { ISD::ADD, MVT::v8i32, 5 },
5646 { ISD::ADD, MVT::v16i16, 5 },
5647 { ISD::ADD, MVT::v32i8, 4 },
5648 };
5649
5650 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5651 assert(ISD && "Invalid opcode");
5652
5653 // Before legalizing the type, give a chance to look up illegal narrow types
5654 // in the table.
5655 // FIXME: Is there a better way to do this?
5656 EVT VT = TLI->getValueType(DL, ValTy);
5657 if (VT.isSimple()) {
5658 MVT MTy = VT.getSimpleVT();
5659 if (ST->useSLMArithCosts())
5660 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5661 return Entry->Cost;
5662
5663 if (ST->hasAVX())
5664 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5665 return Entry->Cost;
5666
5667 if (ST->hasSSE2())
5668 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5669 return Entry->Cost;
5670 }
5671
5672 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5673
5674 MVT MTy = LT.second;
5675
5676 auto *ValVTy = cast<FixedVectorType>(ValTy);
5677
5678 // Special case: vXi8 mul reductions are performed as vXi16.
5679 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5680 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5681 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5682 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5684 CostKind) +
5685 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5686 }
5687
5688 InstructionCost ArithmeticCost = 0;
5689 if (LT.first != 1 && MTy.isVector() &&
5690 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5691 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5692 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5693 MTy.getVectorNumElements());
5694 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5695 ArithmeticCost *= LT.first - 1;
5696 }
5697
5698 if (ST->useSLMArithCosts())
5699 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5700 return ArithmeticCost + Entry->Cost;
5701
5702 if (ST->hasAVX())
5703 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5704 return ArithmeticCost + Entry->Cost;
5705
5706 if (ST->hasSSE2())
5707 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5708 return ArithmeticCost + Entry->Cost;
5709
5710 // FIXME: These assume a naive kshift+binop lowering, which is probably
5711 // conservative in most cases.
5712 static const CostTblEntry AVX512BoolReduction[] = {
5713 { ISD::AND, MVT::v2i1, 3 },
5714 { ISD::AND, MVT::v4i1, 5 },
5715 { ISD::AND, MVT::v8i1, 7 },
5716 { ISD::AND, MVT::v16i1, 9 },
5717 { ISD::AND, MVT::v32i1, 11 },
5718 { ISD::AND, MVT::v64i1, 13 },
5719 { ISD::OR, MVT::v2i1, 3 },
5720 { ISD::OR, MVT::v4i1, 5 },
5721 { ISD::OR, MVT::v8i1, 7 },
5722 { ISD::OR, MVT::v16i1, 9 },
5723 { ISD::OR, MVT::v32i1, 11 },
5724 { ISD::OR, MVT::v64i1, 13 },
5725 };
5726
5727 static const CostTblEntry AVX2BoolReduction[] = {
5728 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5729 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5730 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5731 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5732 };
5733
5734 static const CostTblEntry AVX1BoolReduction[] = {
5735 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5736 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5737 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5738 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5739 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5740 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5741 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5742 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5743 };
5744
5745 static const CostTblEntry SSE2BoolReduction[] = {
5746 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5747 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5748 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5749 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5750 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5751 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5752 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5753 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5754 };
5755
5756 // Handle bool allof/anyof patterns.
5757 if (ValVTy->getElementType()->isIntegerTy(1)) {
5758 if (ISD == ISD::ADD) {
5759 // vXi1 addition reduction will bitcast to scalar and perform a popcount.
5760 auto *IntTy = IntegerType::getIntNTy(ValVTy->getContext(),
5761 ValVTy->getNumElements());
5762 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy});
5763 return getCastInstrCost(Instruction::BitCast, IntTy, ValVTy,
5765 CostKind) +
5767 }
5768
5769 InstructionCost ArithmeticCost = 0;
5770 if (LT.first != 1 && MTy.isVector() &&
5771 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5772 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5773 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5774 MTy.getVectorNumElements());
5775 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5776 ArithmeticCost *= LT.first - 1;
5777 }
5778
5779 if (ST->hasAVX512())
5780 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5781 return ArithmeticCost + Entry->Cost;
5782 if (ST->hasAVX2())
5783 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5784 return ArithmeticCost + Entry->Cost;
5785 if (ST->hasAVX())
5786 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5787 return ArithmeticCost + Entry->Cost;
5788 if (ST->hasSSE2())
5789 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5790 return ArithmeticCost + Entry->Cost;
5791
5792 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5793 }
5794
5795 unsigned NumVecElts = ValVTy->getNumElements();
5796 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5797
5798 // Special case power of 2 reductions where the scalar type isn't changed
5799 // by type legalization.
5800 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5801 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5802
5803 InstructionCost ReductionCost = 0;
5804
5805 auto *Ty = ValVTy;
5806 if (LT.first != 1 && MTy.isVector() &&
5807 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5808 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5809 Ty = FixedVectorType::get(ValVTy->getElementType(),
5810 MTy.getVectorNumElements());
5811 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5812 ReductionCost *= LT.first - 1;
5813 NumVecElts = MTy.getVectorNumElements();
5814 }
5815
5816 // Now handle reduction with the legal type, taking into account size changes
5817 // at each level.
5818 while (NumVecElts > 1) {
5819 // Determine the size of the remaining vector we need to reduce.
5820 unsigned Size = NumVecElts * ScalarSize;
5821 NumVecElts /= 2;
5822 // If we're reducing from 256/512 bits, use an extract_subvector.
5823 if (Size > 128) {
5824 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5825 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5826 CostKind, NumVecElts, SubTy);
5827 Ty = SubTy;
5828 } else if (Size == 128) {
5829 // Reducing from 128 bits is a permute of v2f64/v2i64.
// NOTE(review): `ValVTy` is a FixedVectorType, and Type::isFloatingPointTy()
// is true only for scalar FP types — so this test looks always-false and FP
// reductions take the v2i64/v4i32 shuffle path. Element-type check may have
// been intended; confirm upstream before "fixing".
5830 FixedVectorType *ShufTy;
5831 if (ValVTy->isFloatingPointTy())
5832 ShufTy =
5833 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5834 else
5835 ShufTy =
5836 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5837 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5838 {}, CostKind, 0, nullptr);
5839 } else if (Size == 64) {
5840 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5841 FixedVectorType *ShufTy;
5842 if (ValVTy->isFloatingPointTy())
5843 ShufTy =
5844 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5845 else
5846 ShufTy =
5847 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5848 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5849 {}, CostKind, 0, nullptr);
5850 } else {
5851 // Reducing from smaller size is a shift by immediate.
5852 auto *ShiftTy = FixedVectorType::get(
5853 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5854 ReductionCost += getArithmeticInstrCost(
5855 Instruction::LShr, ShiftTy, CostKind,
5858 }
5859
5860 // Add the arithmetic op for this level.
5861 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5862 }
5863
5864 // Add the final extract element to the cost.
5865 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5866 CostKind, 0, nullptr, nullptr,
5868}
5869
// Cost of a single (two-operand) min/max operation, modeled as the matching
// intrinsic call cost.
// NOTE(review): the signature's first lines (original 5870-5871, carrying the
// IID/Ty/CostKind parameters used below) were dropped by the extraction.
5872 FastMathFlags FMF) const {
5873 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5874 return getIntrinsicInstrCost(ICA, CostKind);
5875}
5876
// X86 cost model for vector min/max reductions: map the intrinsic to an ISD
// min/max node, consult per-ISA cost tables, then fall back to a log2
// shuffle + min/max ladder for power-of-two element counts.
// NOTE(review): the extraction dropped this function's signature (original
// lines 5877-5878), the CostKind parameter line (5880), the f64 ShufTy
// construction at 6016, the shift op-value hints at 6036-6037, and the final
// CostKind argument of the closing getVectorInstrCost call (6047). Code kept
// byte-identical; confirm against upstream before editing.
5879 FastMathFlags FMF,
5881 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5882
5883 MVT MTy = LT.second;
5884
// Translate the intrinsic into the ISD node the cost tables are keyed on.
// Note the tables only key on the *MIN forms; the corresponding *MAX costs
// are assumed symmetric (see the FIXME entries below).
5885 int ISD;
5886 if (ValTy->isIntOrIntVectorTy()) {
5887 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5888 : ISD::SMIN;
5889 } else {
5890 assert(ValTy->isFPOrFPVectorTy() &&
5891 "Expected float point or integer vector type.");
5892 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5893 ? ISD::FMINNUM
5894 : ISD::FMINIMUM;
5895 }
5896
5897 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5898 // and make it as the cost.
5899
5900 static const CostTblEntry SSE2CostTbl[] = {
5901 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5902 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5903 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5904 };
5905
5906 static const CostTblEntry SSE41CostTbl[] = {
5907 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5908 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5909 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5910 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5911 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5912 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5913 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5914 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5915 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5916 {ISD::SMIN, MVT::v16i8, 6},
5917 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5918 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5919 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5920 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5921 };
5922
5923 static const CostTblEntry AVX1CostTbl[] = {
5924 {ISD::SMIN, MVT::v16i16, 6},
5925 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5926 {ISD::SMIN, MVT::v32i8, 8},
5927 {ISD::UMIN, MVT::v32i8, 8},
5928 };
5929
5930 static const CostTblEntry AVX512BWCostTbl[] = {
5931 {ISD::SMIN, MVT::v32i16, 8},
5932 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5933 {ISD::SMIN, MVT::v64i8, 10},
5934 {ISD::UMIN, MVT::v64i8, 10},
5935 };
5936
5937 // Before legalizing the type, give a chance to look up illegal narrow types
5938 // in the table.
5939 // FIXME: Is there a better way to do this?
5940 EVT VT = TLI->getValueType(DL, ValTy);
5941 if (VT.isSimple()) {
5942 MVT MTy = VT.getSimpleVT();
5943 if (ST->hasBWI())
5944 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5945 return Entry->Cost;
5946
5947 if (ST->hasAVX())
5948 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5949 return Entry->Cost;
5950
5951 if (ST->hasSSE41())
5952 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5953 return Entry->Cost;
5954
5955 if (ST->hasSSE2())
5956 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5957 return Entry->Cost;
5958 }
5959
5960 auto *ValVTy = cast<FixedVectorType>(ValTy);
5961 unsigned NumVecElts = ValVTy->getNumElements();
5962
5963 auto *Ty = ValVTy;
5964 InstructionCost MinMaxCost = 0;
5965 if (LT.first != 1 && MTy.isVector() &&
5966 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5967 // Type needs to be split. We need LT.first - 1 operations ops.
5968 Ty = FixedVectorType::get(ValVTy->getElementType(),
5969 MTy.getVectorNumElements());
5970 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5971 MinMaxCost *= LT.first - 1;
5972 NumVecElts = MTy.getVectorNumElements();
5973 }
5974
5975 if (ST->hasBWI())
5976 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5977 return MinMaxCost + Entry->Cost;
5978
5979 if (ST->hasAVX())
5980 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5981 return MinMaxCost + Entry->Cost;
5982
5983 if (ST->hasSSE41())
5984 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5985 return MinMaxCost + Entry->Cost;
5986
5987 if (ST->hasSSE2())
5988 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5989 return MinMaxCost + Entry->Cost;
5990
5991 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5992
5993 // Special case power of 2 reductions where the scalar type isn't changed
5994 // by type legalization.
5995 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5996 ScalarSize != MTy.getScalarSizeInBits())
5997 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5998
5999 // Now handle reduction with the legal type, taking into account size changes
6000 // at each level.
6001 while (NumVecElts > 1) {
6002 // Determine the size of the remaining vector we need to reduce.
6003 unsigned Size = NumVecElts * ScalarSize;
6004 NumVecElts /= 2;
6005 // If we're reducing from 256/512 bits, use an extract_subvector.
6006 if (Size > 128) {
6007 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
6008 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
6009 CostKind, NumVecElts, SubTy);
6010 Ty = SubTy;
6011 } else if (Size == 128) {
6012 // Reducing from 128 bits is a permute of v2f64/v2i64.
// NOTE(review): `ValTy` is the vector type here, and Type::isFloatingPointTy()
// is true only for scalar FP types — this test looks always-false, so FP
// reductions take the integer-shuffle path. Confirm upstream intent.
6013 VectorType *ShufTy;
6014 if (ValTy->isFloatingPointTy())
6015 ShufTy =
6017 else
6018 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
6019 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
6020 CostKind, 0, nullptr);
6021 } else if (Size == 64) {
6022 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
6023 FixedVectorType *ShufTy;
6024 if (ValTy->isFloatingPointTy())
6025 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
6026 else
6027 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
6028 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
6029 CostKind, 0, nullptr);
6030 } else {
6031 // Reducing from smaller size is a shift by immediate.
6032 auto *ShiftTy = FixedVectorType::get(
6033 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
6034 MinMaxCost += getArithmeticInstrCost(
6035 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
6038 }
6039
6040 // Add the arithmetic op for this level.
6041 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
6042 }
6043
6044 // Add the final extract element to the cost.
6045 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
6046 CostKind, 0, nullptr, nullptr,
6048}
6049
6050/// Calculate the cost of materializing a 64-bit value. This helper
6051/// method might only calculate a fraction of a larger immediate. Therefore it
6052/// is valid to return a cost of ZERO.
// NOTE(review): the signature line (original 6053, presumably
// "InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) const {") was
// dropped by the extraction.
// Zero is free (xor reg,reg); an i32-representable value is one mov; anything
// wider needs a movabs-style 2x cost.
6054 if (Val == 0)
6055 return TTI::TCC_Free;
6056
6057 if (isInt<32>(Val))
6058 return TTI::TCC_Basic;
6059
6060 return 2 * TTI::TCC_Basic;
6061}
6062
// Cost of materializing an arbitrary-width integer immediate: sign-extend to
// a multiple of 64 bits, then sum the per-64-bit-chunk materialization costs,
// charging at least one instruction overall.
// NOTE(review): the signature (original lines 6063-6064) and the `Cost`
// accumulator declaration (6088) were dropped by the extraction.
6065 assert(Ty->isIntegerTy());
6066
6067 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6068 if (BitSize == 0)
6069 return ~0U;
6070
6071 // Never hoist constants larger than 128bit, because this might lead to
6072 // incorrect code generation or assertions in codegen.
6073 // Fixme: Create a cost model for types larger than i128 once the codegen
6074 // issues have been fixed.
6075 if (BitSize > 128)
6076 return TTI::TCC_Free;
6077
6078 if (Imm == 0)
6079 return TTI::TCC_Free;
6080
6081 // Sign-extend all constants to a multiple of 64-bit.
6082 APInt ImmVal = Imm;
6083 if (BitSize % 64 != 0)
6084 ImmVal = Imm.sext(alignTo(BitSize, 64));
6085
6086 // Split the constant into 64-bit chunks and calculate the cost for each
6087 // chunk.
6089 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
6090 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
6091 int64_t Val = Tmp.getSExtValue();
6092 Cost += getIntImmCost(Val);
6093 }
6094 // We need at least one instruction to materialize the constant.
6095 return std::max<InstructionCost>(1, Cost);
6096}
6097
// Cost of an integer immediate when used as operand `Idx` of instruction
// `Opcode` — the driver for ConstantHoisting. Returns TCC_Free for immediates
// that X86 can fold directly into the instruction encoding.
// NOTE(review): the signature's first line (original 6098), the CostKind
// parameter line (6100), and the `Cost` computation at 6201 (presumably
// calling the APInt getIntImmCost overload above) were dropped by the
// extraction.
6099 const APInt &Imm, Type *Ty,
6101 Instruction *Inst) const {
6102 assert(Ty->isIntegerTy());
6103
6104 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6105 unsigned ImmBitWidth = Imm.getBitWidth();
6106
6107 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6108 // here, so that constant hoisting will ignore this constant.
6109 if (BitSize == 0)
6110 return TTI::TCC_Free;
6111
// ImmIdx records which operand position can hold a foldable immediate for
// this opcode; ~0U means "no operand position folds".
6112 unsigned ImmIdx = ~0U;
6113 switch (Opcode) {
6114 default:
6115 return TTI::TCC_Free;
6116 case Instruction::GetElementPtr:
6117 // Always hoist the base address of a GetElementPtr. This prevents the
6118 // creation of new constants for every base constant that gets constant
6119 // folded with the offset.
6120 if (Idx == 0)
6121 return 2 * TTI::TCC_Basic;
6122 return TTI::TCC_Free;
6123 case Instruction::Store:
6124 ImmIdx = 0;
6125 break;
6126 case Instruction::ICmp:
6127 // This is an imperfect hack to prevent constant hoisting of
6128 // compares that might be trying to check if a 64-bit value fits in
6129 // 32-bits. The backend can optimize these cases using a right shift by 32.
6130 // There are other predicates and immediates the backend can use shifts for.
6131 if (Idx == 1 && ImmBitWidth == 64) {
6132 uint64_t ImmVal = Imm.getZExtValue();
6133 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6134 return TTI::TCC_Free;
6135
6136 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6137 if (Cmp->isEquality()) {
6138 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6139 if (Known.countMinTrailingZeros() >= 32)
6140 return TTI::TCC_Free;
6141 }
6142 }
6143 }
6144 ImmIdx = 1;
6145 break;
6146 case Instruction::And:
6147 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6148 // by using a 32-bit operation with implicit zero extension. Detect such
6149 // immediates here as the normal path expects bit 31 to be sign extended.
6150 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6151 return TTI::TCC_Free;
6152 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
6153 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6154 Imm.isMask())
6155 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6156 ImmIdx = 1;
6157 break;
6158 case Instruction::Add:
6159 case Instruction::Sub:
6160 // For add/sub, we can use the opposite instruction for INT32_MIN.
6161 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6162 return TTI::TCC_Free;
6163 ImmIdx = 1;
6164 break;
6165 case Instruction::UDiv:
6166 case Instruction::SDiv:
6167 case Instruction::URem:
6168 case Instruction::SRem:
6169 // Division by constant is typically expanded later into a different
6170 // instruction sequence. This completely changes the constants.
6171 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6172 return TTI::TCC_Free;
6173 case Instruction::Mul:
6174 case Instruction::Or:
6175 case Instruction::Xor:
6176 ImmIdx = 1;
6177 break;
6178 // Always return TCC_Free for the shift value of a shift instruction.
6179 case Instruction::Shl:
6180 case Instruction::LShr:
6181 case Instruction::AShr:
6182 if (Idx == 1)
6183 return TTI::TCC_Free;
6184 break;
6185 case Instruction::Trunc:
6186 case Instruction::ZExt:
6187 case Instruction::SExt:
6188 case Instruction::IntToPtr:
6189 case Instruction::PtrToInt:
6190 case Instruction::BitCast:
6191 case Instruction::PHI:
6192 case Instruction::Call:
6193 case Instruction::Select:
6194 case Instruction::Ret:
6195 case Instruction::Load:
6196 break;
6197 }
6198
6199 if (Idx == ImmIdx) {
// If materializing the immediate costs no more than one instruction per
// 64-bit chunk, folding it into the user is free.
6200 uint64_t NumConstants = divideCeil(BitSize, 64);
6202 return (Cost <= NumConstants * TTI::TCC_Basic)
6203 ? static_cast<int>(TTI::TCC_Free)
6204 : Cost;
6205 }
6206
6207 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6208}
6209
                                               const APInt &Imm, Type *Ty,
  // Cost of an immediate operand of the given intrinsic: TCC_Free when the
  // immediate can be folded into the instruction encoding, otherwise the
  // generic integer-immediate materialization cost.
  // NOTE(review): the opening signature line(s) of this definition are not
  // visible in this excerpt — confirm against upstream.
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // The second operand is free when it fits a sign-extended imm32.
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    // Operand indices 0-1 are always free; other operands are free when they
    // fit in a signed 64-bit immediate.
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    // Operand indices below 4 are always free; other operands are free when
    // they fit in a signed 64-bit immediate.
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
      return TTI::TCC_Free;
    break;
  }
  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
6246
                                           const Instruction *I) const {
  // Control-flow instruction cost: PHIs are free, other control flow costs a
  // basic unit.
  // NOTE(review): a CostKind guard line appears to be missing before this
  // return in the excerpt — confirm against upstream.
  return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
  // Branches are assumed to be predicted.
  return TTI::TCC_Free;
}
6255
6256int X86TTIImpl::getGatherOverhead() const {
6257 // Some CPUs have more overhead for gather. The specified overhead is relative
6258 // to the Load operation. "2" is the number provided by Intel architects. This
6259 // parameter is used for cost estimation of Gather Op and comparison with
6260 // other alternatives.
6261 // TODO: Remove the explicit hasAVX512()?, That would mean we would only
6262 // enable gather with a -march.
6263 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6264 return 2;
6265
6266 return 1024;
6267}
6268
6269int X86TTIImpl::getScatterOverhead() const {
6270 if (ST->hasAVX512())
6271 return 2;
6272
6273 return 1024;
6274}
6275
// Return an average cost of Gather / Scatter instruction, maybe improved later.
InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
                                            Type *SrcVTy, const Value *Ptr,
                                            Align Alignment,
                                            unsigned AddressSpace) const {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
  // to split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    const Value *Ptrs = GEP->getPointerOperand();
    // A vector of non-splat base pointers cannot use the narrow-index form.
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
      return IndexSize;
    for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
      if (isa<Constant>(GEP->getOperand(I)))
        continue;
      Type *IndxTy = GEP->getOperand(I)->getType();
      if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
        IndxTy = IndexVTy->getElementType();
      // A 64-bit index that is not a sign-extension, or a second variable
      // index, forces the full 64-bit index width.
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
           !isa<SExtInst>(GEP->getOperand(I))) ||
          ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };

  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
                           ? getIndexSizeInBits(Ptr, DL)
                           : DL.getPointerSizeInBits();

  auto *IndexVTy = FixedVectorType::get(
      IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
  std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
  // Whichever of the index or data vector legalizes into more pieces
  // determines how many gathers/scatters are actually emitted.
  InstructionCost::CostType SplitFactor =
      std::max(IdxsLT.first, SrcLT.first).getValue();
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    auto *SplitSrcTy =
        FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
                                         Alignment, AddressSpace);
  }

  // If we didn't split, this will be a single gather/scatter instruction.
  // NOTE(review): a CostKind guard line appears to be missing before this
  // return in the excerpt — confirm against upstream.
  return 1;

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction in a time.
  const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
                                                       : getScatterOverhead();
  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                           Alignment, AddressSpace, CostKind);
}
6346
/// Calculate the cost of Gather / Scatter operation
  // NOTE(review): the enclosing function signature is not visible in this
  // excerpt; the body reads memory-intrinsic attributes MICA and a CostKind.
  bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
                MICA.getID() == Intrinsic::vp_gather;
  unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
  Type *SrcVTy = MICA.getDataType();
  const Value *Ptr = MICA.getPointer();
  Align Alignment = MICA.getAlignment();
  // Bail to a fallback cost when a real gather/scatter is not legal for this
  // type (conditions partially garbled in this excerpt — confirm upstream).
  if ((Opcode == Instruction::Load &&
       (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
        Align(Alignment)))) ||
      (Opcode == Instruction::Store &&
       (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
        Align(Alignment)))))

  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
  // Determine the address space from the pointer (or pointer-vector element)
  // type.
  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy && Ptr->getType()->isVectorTy())
    PtrTy = dyn_cast<PointerType>(
        cast<VectorType>(Ptr->getType())->getElementType());
  assert(PtrTy && "Unexpected type for Ptr argument");
  unsigned AddressSpace = PtrTy->getAddressSpace();
  return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
                         AddressSpace);
}
6377
                                   const TargetTransformInfo::LSRCost &C2) const {
  // X86 specific here are "instruction number 1st priority".
  // Lexicographic comparison: instruction count dominates, then register
  // count, then the remaining LSR cost components in order.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
6386
  // Compare and branch can be fused (macro- or branch-fusion) on this
  // subtarget, so it pays to keep them adjacent.
  return ST->hasMacroFusion() || ST->hasBranchFusion();
}
6390
6391static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6392 if (!ST->hasAVX())
6393 return false;
6394
6395 if (ScalarTy->isPointerTy())
6396 return true;
6397
6398 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6399 return true;
6400
6401 if (ScalarTy->isHalfTy() && ST->hasBWI())
6402 return true;
6403
6404 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6405 return true;
6406
6407 if (!ScalarTy->isIntegerTy())
6408 return false;
6409
6410 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6411 return IntWidth == 32 || IntWidth == 64 ||
6412 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6413}
6414
                                  unsigned AddressSpace,
                                  TTI::MaskKind MaskKind) const {
  Type *ScalarTy = DataTy->getScalarType();

  // The backend can't handle a single element vector w/o CFCMOV.
  if (isa<VectorType>(DataTy) &&
      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
    return ST->hasCF() &&
           hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);

  // Otherwise legality depends only on the element type and subtarget.
  return isLegalMaskedLoadStore(ScalarTy, ST);
}
6428
                                   unsigned AddressSpace,
                                   TTI::MaskKind MaskKind) const {
  Type *ScalarTy = DataTy->getScalarType();

  // The backend can't handle a single element vector w/o CFCMOV.
  if (isa<VectorType>(DataTy) &&
      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
    return ST->hasCF() &&
           hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);

  // Otherwise legality depends only on the element type and subtarget.
  return isLegalMaskedLoadStore(ScalarTy, ST);
}
6442
6443bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6444 unsigned DataSize = DL.getTypeStoreSize(DataType);
6445 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6446 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6447 // (the equivalent stores only require AVX).
6448 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6449 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6450
6451 return false;
6452}
6453
bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
  unsigned DataSize = DL.getTypeStoreSize(DataType);

  // SSE4A supports nontemporal stores of float and double at arbitrary
  // alignment.
  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
    return true;

  // Besides the SSE4A subtarget exception above, only aligned stores are
  // available nontemporaly on any other subtarget. And only stores with a size
  // of 4..32 bytes (powers of 2, only) are permitted.
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
  // NOTE(review): the power-of-two operand of this '||' is not visible in
  // this excerpt — confirm against upstream.
    return false;

  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
  // loads require AVX2).
  if (DataSize == 32)
    return ST->hasAVX();
  if (DataSize == 16)
    return ST->hasSSE1();
  return true;
}
6477
                                      ElementCount NumElements) const {
  // movddup
  // Only a fixed two-element double vector qualifies, and only with SSE3.
  return ST->hasSSE3() && !NumElements.isScalable() &&
         NumElements.getFixedValue() == 2 &&
         ElementTy == Type::getDoubleTy(ElementTy->getContext());
}
6485
6486bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6487 if (!isa<VectorType>(DataTy))
6488 return false;
6489
6490 if (!ST->hasAVX512())
6491 return false;
6492
6493 // The backend can't handle a single element vector.
6494 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6495 return false;
6496
6497 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6498
6499 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6500 return true;
6501
6502 if (!ScalarTy->isIntegerTy())
6503 return false;
6504
6505 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6506 return IntWidth == 32 || IntWidth == 64 ||
6507 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6508}
6509
                                            Align Alignment) const {
  // Compress-store legality mirrors expand-load legality exactly.
  return isLegalMaskedExpandLoad(DataTy, Alignment);
}
6514
6515bool X86TTIImpl::supportsGather() const {
6516 // Some CPUs have better gather performance than others.
6517 // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
6518 // enable gather with a -march.
6519 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6520}
6521
                                            Align Alignment) const {
  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
  // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
  // it to 8 elements, but zeroing upper bits of the mask vector will add more
  // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
  // Check, maybe the gather/scatter instruction is better in the VariableMask
  // case.
  unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
  // Scalarize single-element vectors, and on AVX-512 also the small element
  // counts described above.
  return NumElts == 1 ||
         (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
}
6534
                                            Align Alignment) const {
  // Shared element-type legality check for gathers and scatters: pointers,
  // f32/f64, and i32/i64 elements are supported.
  Type *ScalarTy = DataTy->getScalarType();
  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 32 || IntWidth == 64;
}
6550
6551bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6552 if (!supportsGather() || !ST->preferGather())
6553 return false;
6554 return isLegalMaskedGatherScatter(DataTy, Alignment);
6555}
6556
6557bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6558 unsigned Opcode1,
6559 const SmallBitVector &OpcodeMask) const {
6560 // ADDSUBPS 4xf32 SSE3
6561 // VADDSUBPS 4xf32 AVX
6562 // VADDSUBPS 8xf32 AVX2
6563 // ADDSUBPD 2xf64 SSE3
6564 // VADDSUBPD 2xf64 AVX
6565 // VADDSUBPD 4xf64 AVX2
6566
6567 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6568 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6569 if (!isPowerOf2_32(NumElements))
6570 return false;
6571 // Check the opcode pattern. We apply the mask on the opcode arguments and
6572 // then check if it is what we expect.
6573 for (int Lane : seq<int>(0, NumElements)) {
6574 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6575 // We expect FSub for even lanes and FAdd for odd lanes.
6576 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6577 return false;
6578 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6579 return false;
6580 }
6581 // Now check that the pattern is supported by the target ISA.
6582 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6583 if (ElemTy->isFloatTy())
6584 return ST->hasSSE3() && NumElements % 4 == 0;
6585 if (ElemTy->isDoubleTy())
6586 return ST->hasSSE3() && NumElements % 2 == 0;
6587 return false;
6588}
6589
6590bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6591 // AVX2 doesn't support scatter
6592 if (!ST->hasAVX512() || !ST->preferScatter())
6593 return false;
6594 return isLegalMaskedGatherScatter(DataType, Alignment);
6595}
6596
6597bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6598 EVT VT = TLI->getValueType(DL, DataType);
6599 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6600}
6601
  // FDIV is always expensive, even if it has a very low uop count.
  // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
  if (I->getOpcode() == Instruction::FDiv)
    return true;

  // NOTE(review): the signature and the non-FDiv fallthrough return are not
  // visible in this excerpt — confirm against upstream.
}
6610
6611bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6612
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Work this as a subsetting of subtarget features.
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Check whether features are the same (apart from the ignore list).
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if (RealCallerBits == RealCalleeBits)
    return true;

  // If the features are a subset, we need to additionally check for calls
  // that may become ABI-incompatible as a result of inlining.
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  for (const Instruction &I : instructions(Callee)) {
    if (const auto *CB = dyn_cast<CallBase>(&I)) {
      // Having more target features is fine for inline ASM and intrinsics.
      if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
        continue;

      // Collect the argument and return types of this call.
      // NOTE(review): the declaration of `Types` is not visible in this
      // excerpt — confirm against upstream.
      for (Value *Arg : CB->args())
        Types.push_back(Arg->getType());
      if (!CB->getType()->isVoidTy())
        Types.push_back(CB->getType());

      // Simple types are always ABI compatible.
      auto IsSimpleTy = [](Type *Ty) {
        return !Ty->isVectorTy() && !Ty->isAggregateType();
      };
      if (all_of(Types, IsSimpleTy))
        continue;

      // Do a precise compatibility check.
      if (!areTypesABICompatible(Caller, Callee, Types))
        return false;
    }
  }
  return true;
}
6660
                                       const Function *Callee,
                                       ArrayRef<Type *> Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // If we get here, we know the target features match. If one function
  // considers 512-bit vectors legal and the other does not, consider them
  // incompatible.
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // NOTE(review): the right-hand side of this comparison is not visible in
  // this excerpt — confirm against upstream.
  if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
    return true;

  // Consider the arguments compatible if they aren't vectors or aggregates.
  // FIXME: Look at the size of vectors.
  // FIXME: Look at the element types of aggregates to see if there are vectors.
  return llvm::none_of(Types,
      [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
}
6682
X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  // Configure inline memcmp expansion: the maximum number of loads, loads
  // per block, and the permitted load sizes for this subtarget.
  // NOTE(review): the declaration of `Options` is not visible in this
  // excerpt — confirm against upstream.
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = 2;
  // All GPR and vector loads can be unaligned.
  Options.AllowOverlappingLoads = true;
  if (IsZeroCmp) {
    // Only enable vector loads for equality comparison. Right now the vector
    // version is not as fast for three way compare (see #33329).
    const unsigned PreferredWidth = ST->getPreferVectorWidth();
    if (PreferredWidth >= 512 && ST->hasAVX512())
      Options.LoadSizes.push_back(64);
    if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
    if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
  }
  // Scalar load sizes, largest first; 8-byte loads require a 64-bit target.
  if (ST->is64Bit()) {
    Options.LoadSizes.push_back(8);
  }
  Options.LoadSizes.push_back(4);
  Options.LoadSizes.push_back(2);
  Options.LoadSizes.push_back(1);
  return Options;
}
6707
  // NOTE(review): the enclosing signature is not visible in this excerpt;
  // the result simply tracks supportsGather().
  return supportsGather();
}
6711
  // NOTE(review): the enclosing signature is not visible in this excerpt;
  // this hook is unconditionally disabled on x86.
  return false;
}
6715
  // TODO: We expect this to be beneficial regardless of arch,
  // but there are currently some unexplained performance artifacts on Atom.
  // As a temporary solution, disable on Atom.
  // (Enabled on every non-Atom subtarget.)
  return !(ST->isAtom());
}
6722
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, bool UseMaskForCond,
    bool UseMaskForGaps) const {
  // NOTE(review): the first signature line of this definition is not visible
  // in this excerpt — confirm against upstream.
  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTypeLegalizationCost(VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
                                             LegalVT.getVectorNumElements());
  InstructionCost MemOpCost;
  bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
  if (UseMaskedMemOp) {
    // Masked accesses are costed through the masked load/store intrinsics.
    unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
                                               : Intrinsic::masked_store;
    MemOpCost = getMaskedMemoryOpCost(
        {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind);
  } else
    MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
                                CostKind);

  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT =
      MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);

  InstructionCost MaskCost;
  if (UseMaskedMemOp) {
    // Mark which elements of the wide vector the requested indices actually
    // touch.
    APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op");
      for (unsigned Elm = 0; Elm < VF; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    Type *I1Type = Type::getInt1Ty(VecTy->getContext());

    // NOTE(review): the false-arm of the ternary below is not visible in
    // this excerpt — confirm against upstream.
    MaskCost = getReplicationShuffleCost(
        I1Type, Factor, VF,
        UseMaskForGaps ? DemandedLoadStoreElts
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
      MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
    }
  }

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores are computed separately from the table.

    // X86InterleavedAccess support only the following interleaved-access group.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    //If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost = getShuffleCost(
        ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);

    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
                                          VecTy->getNumElements() / Factor);
    InstructionCost NumOfResults =
        getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, or the loads are masked,
    // we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point");
  // X86InterleavedAccess support only the following interleaved-access group.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 32i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  //If an entry does not exist, fallback to the default implementation.

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
                     CostKind, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  // NOTE(review): the line introducing `Cost` here is not visible in this
  // excerpt — confirm against upstream.
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
}
6884
6886 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6887 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6888 bool UseMaskForCond, bool UseMaskForGaps) const {
6889 auto *VecTy = cast<FixedVectorType>(BaseTy);
6890
6891 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6892 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6893 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6894 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6895 return true;
6896 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6897 return ST->hasBWI();
6898 if (EltTy->isBFloatTy())
6899 return ST->hasBF16();
6900 return false;
6901 };
6902 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6904 Opcode, VecTy, Factor, Indices, Alignment,
6905 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6906
6907 if (UseMaskForCond || UseMaskForGaps)
6908 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6909 Alignment, AddressSpace, CostKind,
6910 UseMaskForCond, UseMaskForGaps);
6911
6912 // Get estimation for interleaved load/store operations for SSE-AVX2.
6913 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6914 // computing the cost using a generic formula as a function of generic
6915 // shuffles. We therefore use a lookup table instead, filled according to
6916 // the instruction sequences that codegen currently generates.
6917
6918 // VecTy for interleave memop is <VF*Factor x Elt>.
6919 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6920 // VecTy = <12 x i32>.
6921 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6922
6923 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6924 // the VF=2, while v2i128 is an unsupported MVT vector type
6925 // (see MachineValueType.h::getVectorVT()).
6926 if (!LegalVT.isVector())
6927 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6928 Alignment, AddressSpace, CostKind);
6929
6930 unsigned VF = VecTy->getNumElements() / Factor;
6931 Type *ScalarTy = VecTy->getElementType();
6932 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6933 if (!ScalarTy->isIntegerTy())
6934 ScalarTy =
6935 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6936
6937 // Get the cost of all the memory operations.
6938 // FIXME: discount dead loads.
6939 InstructionCost MemOpCosts =
6940 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
6941
6942 auto *VT = FixedVectorType::get(ScalarTy, VF);
6943 EVT ETy = TLI->getValueType(DL, VT);
6944 if (!ETy.isSimple())
6945 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6946 Alignment, AddressSpace, CostKind);
6947
6948 // TODO: Complete for other data-types and strides.
6949 // Each combination of Stride, element bit width and VF results in a different
6950 // sequence; The cost tables are therefore accessed with:
6951 // Factor (stride) and VectorType=VFxiN.
6952 // The Cost accounts only for the shuffle sequence;
6953 // The cost of the loads/stores is accounted for separately.
6954 //
6955 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6956 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6957 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6958 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6959 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6960 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6961
6962 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6963 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6964 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6965
6966 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6967 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6968 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6969
6970 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6971 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6972 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6973 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6974
6975 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6976 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6977 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6978 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6979 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6980
6981 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6982 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6983 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6984 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6985 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6986
6987 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6988 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6989 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6990 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6991 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6992
6993 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6994 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6995 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6996 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6997
6998 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6999 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
7000 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
7001 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
7002 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
7003
7004 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
7005 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
7006 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
7007 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
7008 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
7009
7010 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
7011 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
7012 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
7013 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
7014 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
7015
7016 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
7017 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
7018 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
7019 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
7020
7021 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
7022 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
7023 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
7024 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
7025 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
7026
7027 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
7028 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
7029 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
7030 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
7031 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
7032
7033 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
7034 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
7035 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
7036 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
7037
7038 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
7039 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
7040 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
7041
7042 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
7043 };
7044
7045 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
7046 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
7047 };
7048
7049 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
7050 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
7051 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
7052
7053 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
7054 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
7055
7056 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
7057 };
7058
7059 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
7060 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
7061 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
7062
7063 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
7064 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
7065 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
7066
7067 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
7068 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
7069 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
7070 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
7071
7072 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
7073 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
7074 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
7075 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
7076 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
7077
7078 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
7079 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
7080 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
7081 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
7082 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
7083
7084 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
7085 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
7086 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
7087 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
7088 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
7089
7090 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
7091 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
7092 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
7093 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
7094 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
7095
7096 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
7097 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
7098 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
7099 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
7100
7101 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7102 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7103 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7104 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7105 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7106
7107 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7108 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7109 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7110 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7111 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7112
7113 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7114 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7115 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7116 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7117 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7118
7119 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7120 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7121 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7122 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7123
7124 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7125 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7126 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7127 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7128 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7129
7130 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7131 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7132 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7133 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7134 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7135
7136 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7137 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7138 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7139 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7140
7141 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7142 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7143 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7144 };
7145
7146 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7147 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7148 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7149 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7150
7151 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7152 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7153
7154 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7155 };
7156
7157 if (Opcode == Instruction::Load) {
7158 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7159 MemOpCosts](const CostTblEntry *Entry) {
7160 // NOTE: this is just an approximation!
7161 // It can over/under -estimate the cost!
7162 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7163 };
7164
7165 if (ST->hasAVX2())
7166 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7167 ETy.getSimpleVT()))
7168 return GetDiscountedCost(Entry);
7169
7170 if (ST->hasSSSE3())
7171 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7172 ETy.getSimpleVT()))
7173 return GetDiscountedCost(Entry);
7174
7175 if (ST->hasSSE2())
7176 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7177 ETy.getSimpleVT()))
7178 return GetDiscountedCost(Entry);
7179 } else {
7180 assert(Opcode == Instruction::Store &&
7181 "Expected Store Instruction at this point");
7182 assert((!Indices.size() || Indices.size() == Factor) &&
7183 "Interleaved store only supports fully-interleaved groups.");
7184 if (ST->hasAVX2())
7185 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7186 ETy.getSimpleVT()))
7187 return MemOpCosts + Entry->Cost;
7188
7189 if (ST->hasSSE2())
7190 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7191 ETy.getSimpleVT()))
7192 return MemOpCosts + Entry->Cost;
7193 }
7194
7195 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7196 Alignment, AddressSpace, CostKind,
7197 UseMaskForCond, UseMaskForGaps);
7198}
7199
7201 StackOffset BaseOffset,
7202 bool HasBaseReg, int64_t Scale,
7203 unsigned AddrSpace) const {
7204 // Scaling factors are not free at all.
7205 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7206 // will take 2 allocations in the out of order engine instead of 1
7207 // for plain addressing mode, i.e. inst (reg1).
7208 // E.g.,
7209 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7210 // Requires two allocations (one for the load, one for the computation)
7211 // whereas:
7212 // vaddps (%rsi), %ymm0, %ymm1
7213 // Requires just 1 allocation, i.e., freeing allocations for other operations
7214 // and having less micro operations to execute.
7215 //
7216 // For some X86 architectures, this is even worse because for instance for
7217 // stores, the complex addressing mode forces the instruction to use the
7218 // "load" ports instead of the dedicated "store" port.
7219 // E.g., on Haswell:
7220 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7221 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7223 AM.BaseGV = BaseGV;
7224 AM.BaseOffs = BaseOffset.getFixed();
7225 AM.HasBaseReg = HasBaseReg;
7226 AM.Scale = Scale;
7227 AM.ScalableOffset = BaseOffset.getScalable();
7228 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7229 // Scale represents reg2 * scale, thus account for 1
7230 // as soon as we use a second register.
7231 return AM.Scale != 0;
7233}
7234
7236 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7237 return 14;
7238}
7239
7241 unsigned Bits = Ty->getScalarSizeInBits();
7242
7243 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7244 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7245 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7246 return false;
7247
7248 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7249 // shifts just as cheap as scalar ones.
7250 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7251 return false;
7252
7253 // AVX512BW has shifts such as vpsllvw.
7254 if (ST->hasBWI() && Bits == 16)
7255 return false;
7256
7257 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7258 // fully general vector.
7259 return true;
7260}
7261
7262unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7263 Type *ScalarValTy, Align Alignment,
7264 unsigned AddrSpace) const {
7265 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7266 return 4;
7267 }
7268 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy, Alignment,
7269 AddrSpace);
7270}
7271
7273 SmallVectorImpl<Use *> &Ops) const {
7274 using namespace llvm::PatternMatch;
7275
7276 if (I->getOpcode() == Instruction::And &&
7277 (ST->hasBMI() || (I->getType()->isVectorTy() && ST->hasSSE2()))) {
7278 for (auto &Op : I->operands()) {
7279 // (and X, (not Y)) -> (andn X, Y)
7280 if (match(Op.get(), m_Not(m_Value())) && !I->getType()->isIntegerTy(8)) {
7281 Ops.push_back(&Op);
7282 return true;
7283 }
7284 // (and X, (splat (not Y))) -> (andn X, (splat Y))
7285 if (match(Op.get(),
7287 m_Value(), m_ZeroMask()))) {
7288 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
7289 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
7290 Ops.push_back(&Not);
7291 Ops.push_back(&InsertElt);
7292 Ops.push_back(&Op);
7293 return true;
7294 }
7295 }
7296 }
7297
7298 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7299 if (!VTy)
7300 return false;
7301
7302 if (I->getOpcode() == Instruction::Mul &&
7303 VTy->getElementType()->isIntegerTy(64)) {
7304 for (auto &Op : I->operands()) {
7305 // Make sure we are not already sinking this operand
7306 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7307 continue;
7308
7309 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7310 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7311 if (ST->hasSSE41() &&
7312 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7313 m_SpecificInt(32)))) {
7314 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7315 Ops.push_back(&Op);
7316 } else if (ST->hasSSE2() &&
7317 match(Op.get(),
7318 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7319 Ops.push_back(&Op);
7320 }
7321 }
7322
7323 return !Ops.empty();
7324 }
7325
7326 // A uniform shift amount in a vector shift or funnel shift may be much
7327 // cheaper than a generic variable vector shift, so make that pattern visible
7328 // to SDAG by sinking the shuffle instruction next to the shift.
7329 int ShiftAmountOpNum = -1;
7330 if (I->isShift())
7331 ShiftAmountOpNum = 1;
7332 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7333 if (II->getIntrinsicID() == Intrinsic::fshl ||
7334 II->getIntrinsicID() == Intrinsic::fshr)
7335 ShiftAmountOpNum = 2;
7336 }
7337
7338 if (ShiftAmountOpNum == -1)
7339 return false;
7340
7341 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7342 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7343 isVectorShiftByScalarCheap(I->getType())) {
7344 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7345 return true;
7346 }
7347
7348 return false;
7349}
7350
7352 bool HasEGPR = ST->hasEGPR();
7353 const TargetMachine &TM = getTLI()->getTargetMachine();
7354
7355 for (User *U : F.users()) {
7357 if (!CB || CB->getCalledOperand() != &F)
7358 continue;
7359 Function *CallerFunc = CB->getFunction();
7360 if (TM.getSubtarget<X86Subtarget>(*CallerFunc).hasEGPR() != HasEGPR)
7361 return false;
7362 }
7363
7364 return true;
7365}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
CostTblEntryT< CostKindCosts > CostKindTblEntry
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
TypeConversionCostTblEntryT< CostKindCosts > TypeConversionCostKindTblEntry
This file a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1353
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:483
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy, Align Alignment, unsigned AddrSpace) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Value * getCalledOperand() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
const FeatureBitset & getFeatureBits() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
Class to represent pointers.
unsigned getAddressSpace() const
Return the address space of the Pointer type.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:263
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:291
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:399
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
Base class of all SIMD vector types.
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Type * getElementType() const
bool useAVX512Regs() const
bool hasAVX512() const
bool hasAVX2() const
bool useFastCCForInternalCall(Function &F) const override
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy, Align Alignment, unsigned AddrSpace) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool prefersVectorizedAddressing() const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Type) const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
Calculate the cost of Gather / Scatter operation.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition APInt.cpp:3061
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:788
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:787
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
ap_match< APInt > m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
auto m_Value()
Match an arbitrary value and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
This is an optimization pass for GlobalISel generic memory operations.
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t DataSize
Definition InstrProf.h:299
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1884
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2018
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Cost Table Entry.
Definition CostTable.h:25
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:256
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition CostTable.h:55