//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: rather than modelling a
/// specific CPU, the numbers correspond to a "generic" X86 CPU. Usually a
/// number corresponds to the CPU where the feature first appeared. For
/// example, if we use Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem, as that was the first CPU to support that feature level
/// and thus most likely has the worst case cost, although we may discard an
/// outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                   divss     sqrtss     rsqrtss
///   AMD K7          11-16     19         3
///   Piledriver      9-24      13-15      5
///   Jaguar          14        16         2
///   Pentium II,III  18        30         2
///   Nehalem         7-14      7-18       3
///   Haswell         10-13     11         5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values,
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
//===----------------------------------------------------------------------===//
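//
// Illustrative reading (not normative): the AVX1 table entry below for f32
// FDIV on Sandy Bridge, { 14, 14, 1, 1 }, means reciprocal throughput 14,
// latency 14, 1 instruction of code size, and 1 micro-op for size-and-latency.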

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
using TypeConversionCostTblEntry = TypeConversionCostTblEntryT<CostKindCosts>;

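// Illustrative note: a table entry such as
//   { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }
// stores the four per-kind costs in the struct order above;
// CostKindCosts::operator[] selects one by TargetCostKind and callers then
// scale it by the type legalization count (LT.first).
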
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; //  32 KByte
  case TargetTransformInfo::CacheLevel::L2:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}
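
// Illustrative: with CF enabled, i16/i32/i64 scalars (or single-element
// vectors of those) report true and can use CFCMOV, while i8 or multi-element
// vector types report false.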

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop, which saves the overflow
  // check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TTI::CastContextHint::None, CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TTI::CastContextHint::None, CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info,
                                  Op2Info);
  }
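
  // Illustrative: under this rule a mul <8 x i8> is costed as
  // zext <8 x i8> -> <8 x i16>, mul <8 x i16>, then trunc back to <8 x i8>.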

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
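
  // Illustrative: a multiply by splat(8) is costed as one shl; a multiply by
  // splat(-8) is costed as the shl plus a sub (negate).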

  // On X86, vector signed division by a constant power-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
    }

    return Cost;
  }
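
  // Illustrative: sdiv <4 x i32> %x, splat(4) expands to
  //   psrad $31, psrld $30, paddd, psrad $2
  // which is the 2*AShr + LShr + Add sequence costed above.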

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
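
  // Illustrative: udiv by splat(8) is costed as a single lshr by 3, and urem
  // by splat(8) as a single 'and' with splat(7).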

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8,  { 2, 12,  5,  6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 2, 12,  5,  6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1,  8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1,  8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2,  8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2,  8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7,  2,  3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7,  2,  3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9,  5,  6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7,  7,  8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7,  7,  8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, {  6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, {  8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, {  6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, {  8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, {  6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, {  8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, {  6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, {  8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, {  6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, {  8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, {  6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, {  8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 3,  5,  5,  7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 10,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 12,  8, 12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4,  7,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4,  8,  7,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 5, 10, 10, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 4,  7,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 4,  8,  7, 10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 5, 10, 10, 15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5, 10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5, 10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5, 10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 3, 5,  5,  7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 9,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 5,  9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8,  7,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 6, 9, 11, 16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, {  4,  4,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, {  4,  8,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, {  6,  6,  9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, {  7,  8, 11, 14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, {  7,  9, 10, 14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10, 11, 16, 21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3,  1,  2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3,  1,  2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4,  5,  7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7,  4,  6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7,  4,  6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7, 10, 13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, {  9, 10, 6,  9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, {  9, 13, 5,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9, 13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, {  4,  8,  4,  5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, {  4,  8,  4,  5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, {  4,  8,  4,  5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, {  4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, {  4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, {  6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, {  6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, {  7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8,  { 4, 12, 4,  5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8,  { 3, 10, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8,  { 3, 11, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1,  5, 1,  1 } }, // pmullw

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8,  { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8,  { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8,  { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64,  { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::OR,  MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64,  { 6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64,    { 1 } },           // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64,  {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64,    {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2 even though we mark them as
    // custom in order to detect cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128-bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256-bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }
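
  // Illustrative: XOP variable shifts (vpshab/vpshlb etc.) take a negated
  // amount for right shifts; with a constant amount the negation folds away,
  // which is why a constant SRL/SRA is costed via the SHL table entries.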

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by non-uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
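
  // Illustrative: shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> can instead
  // be costed as mul <4 x i32> %x, <i32 2, i32 4, i32 8, i32 16>.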

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,  MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32,   {  1,  4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies(3), shifts(3) and adds(2):
    // slm muldq throughput is 2, shift throughput is 1, and addq
    // throughput is 4, thus: 3*2 (muldq) + 3*1 (shift) + 2*4 (addq) = 17.
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD, MVT::v2i64, {  4,  2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, {  4,  2, 1, 2 } },
1111
1112 if (ST->useSLMArithCosts())
1113 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1114 if (auto KindCost = Entry->Cost[CostKind])
1115 return LT.first * *KindCost;
1116
1117 static const CostKindTblEntry AVX2CostTable[] = {
1118 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1119 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1120 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
1121 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1122
1123 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1124 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1125 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1126 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1127
1128 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1129 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1130 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1131 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1132 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1133 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1134
1135 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1136 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1137 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1138 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1139 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1140 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1141 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1142 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1143
1144 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1145 { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
1146 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1147 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1148 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1149 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1150 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1151
1152 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1153
1154 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1155 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1156
1157 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1158 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1159 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1160 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1161 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1162 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1163
1164 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1165 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1166 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1167 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1168 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1169 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1170
1171 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1172 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1173 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1174 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1175 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1176 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1177
1178 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1179 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1180 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1181 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1182 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1183 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1184 };
1185
1186 // Look for AVX2 lowering tricks for custom cases.
1187 if (ST->hasAVX2())
1188 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1189 if (auto KindCost = Entry->Cost[CostKind])
1190 return LT.first * *KindCost;
1191
1192 static const CostKindTblEntry AVX1CostTable[] = {
1193 // We don't have to scalarize unsupported ops. We can issue two half-sized
1194 // operations and we only need to extract the upper YMM half.
1195 // Two ops + 1 extract + 1 insert = 4.
1196 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1197 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1198 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1199 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1200 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1201 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1202
1203 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1204 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1205 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1206 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1207
1208 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1209 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1210 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1211 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1212
1213 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1214 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1215 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1216 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1217
1218 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1219 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1220 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1221 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1222 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1223 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1224 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1225 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1226 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1227 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1228
1229 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1230 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1231 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1232 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1233 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1234 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1235 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1236 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1237
1238 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1239 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1240 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1241 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1242 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1243 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1244 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1245 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1246
1247 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1248 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1249 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1250 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1251 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1252 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1253 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1254 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1255
1256 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1257 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1258
1259 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1260 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1261 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1262 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1263 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1264 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1265
1266 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1267 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1268 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1269 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1270 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1271 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1272
1273 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1274 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1275 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1276 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1277 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1278 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1279
1280 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1281 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1282 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1283 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1284 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1285 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1286 };
1287
1288 if (ST->hasAVX())
1289 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1290 if (auto KindCost = Entry->Cost[CostKind])
1291 return LT.first * *KindCost;
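// A worked example of how these table lookups compose with legalization (a
// sketch; the numbers are read off the AVX1 table above): an 'add <8 x i64>'
// on an AVX1-only target legalizes to two v4i64 operations, so
// LT = (2, v4i64). The { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } } entry then
// yields a TCK_RecipThroughput cost of 2 * 4 = 8 and a TCK_Latency cost of
// 2 * 2 = 4.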
1292
1293 static const CostKindTblEntry SSE42CostTable[] = {
1294 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1297 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1298
1299 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1303
1304 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1307 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1308
1309 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1313
1314 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1315 };
1316
1317 if (ST->hasSSE42())
1318 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1319 if (auto KindCost = Entry->Cost[CostKind])
1320 return LT.first * *KindCost;
1321
1322 static const CostKindTblEntry SSE41CostTable[] = {
1323 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1324 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1325 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1326
1327 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1328 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1329 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1330 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1331
1332 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1333 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1334 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1335 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1336
1337 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1338 };
1339
1340 if (ST->hasSSE41())
1341 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1342 if (auto KindCost = Entry->Cost[CostKind])
1343 return LT.first * *KindCost;
1344
1345 static const CostKindTblEntry SSSE3CostTable[] = {
1346 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1347 };
1348
1349 if (ST->hasSSSE3())
1350 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1351 if (auto KindCost = Entry->Cost[CostKind])
1352 return LT.first * *KindCost;
1353
1354 static const CostKindTblEntry SSE2CostTable[] = {
1355 // We don't correctly identify costs of casts because they are marked as
1356 // custom.
1357 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1358 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1359 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1360 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1361
1362 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1363 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1364 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1365 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1366
1367 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1368 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1369 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1370 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1371
1372 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1373 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1374 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1375 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1376
1377 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1378 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1379 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1380 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1381
1382 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1383 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1384 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1385 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1386
1387 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1388 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1389
1390 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1391 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1392 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1393 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1394
1395 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1396
1397 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1400 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1401
1402 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1406
1407 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1410
1411 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414
1415 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1417 };
1418
1419 if (ST->hasSSE2())
1420 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1421 if (auto KindCost = Entry->Cost[CostKind])
1422 return LT.first * *KindCost;
1423
1424 static const CostKindTblEntry SSE1CostTable[] = {
1425 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1426 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1427
1428 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1429 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1430
1431 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1432 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1433
1434 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1435 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1436
1437 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1439 };
1440
1441 if (ST->hasSSE1())
1442 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1443 if (auto KindCost = Entry->Cost[CostKind])
1444 return LT.first * *KindCost;
1445
1446 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1447 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1448 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1449 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1450 };
1451
1452 if (ST->is64Bit())
1453 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1454 if (auto KindCost = Entry->Cost[CostKind])
1455 return LT.first * *KindCost;
1456
1457 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1458 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1459 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1460 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1461
1462 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1463 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1464 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1465
1466 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1467 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1468 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1469
1470 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1471 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1472 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1473 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1474 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1475 };
1476
1477 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1478 if (auto KindCost = Entry->Cost[CostKind])
1479 return LT.first * *KindCost;
1480
1481 // It is not a good idea to vectorize division. We have to scalarize it and
1482 // in the process we will often end up having to spill regular
1483 // registers. The overhead of division is going to dominate most kernels
1484 // anyway, so try hard to prevent vectorization of division - it is
1485 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1486 // to hide "20 cycles" for each lane.
1487 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1488 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1489 ISD == ISD::UREM)) {
1490 InstructionCost ScalarCost =
1491 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1492 Op1Info.getNoProps(), Op2Info.getNoProps());
1493 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1494 }
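// A worked example of the heuristic above (illustrative): for a
// 'sdiv <4 x i32>' queried with TCK_RecipThroughput, v4i32 is already legal,
// so LT = (1, v4i32) and the returned cost is
// 20 * 1 * 4 * ScalarCost(sdiv i32) - deliberately large so that the
// vectorizers keep the division scalar.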
1495
1496 // Handle some basic single instruction code size cases.
1497 if (CostKind == TTI::TCK_CodeSize) {
1498 switch (ISD) {
1499 case ISD::FADD:
1500 case ISD::FSUB:
1501 case ISD::FMUL:
1502 case ISD::FDIV:
1503 case ISD::FNEG:
1504 case ISD::AND:
1505 case ISD::OR:
1506 case ISD::XOR:
1507 return LT.first;
1508 break;
1509 }
1510 }
1511
1512 // Fallback to the default implementation.
1513 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1514 Args, CxtI);
1515}
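// The table values above can be cross-checked from the command line with the
// cost-model printer used by the CostModel/X86 regression tests (a sketch;
// 'test.ll' is a placeholder for an IR file containing the instructions of
// interest):
//
//   opt < test.ll -mtriple=x86_64-- -mattr=+avx2 \
//       -passes='print<cost-model>' -cost-kind=throughput \
//       -disable-output 2>&1
//
// Changing -cost-kind selects which of the four per-entry values is reported.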
1516
1517 InstructionCost
1518 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1519 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1520 TTI::TargetCostKind CostKind) const {
1521 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1522 return TTI::TCC_Basic;
1523 return InstructionCost::getInvalid();
1524}
1525
1526 InstructionCost X86TTIImpl::getShuffleCost(
1527 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1528 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1529 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1530 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1531 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1532 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1533
1534 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1535
1536 // Recognize a basic concat_vectors shuffle.
1537 if (Kind == TTI::SK_PermuteTwoSrc &&
1538 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1539 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1540 return getShuffleCost(TTI::SK_InsertSubvector,
1541 VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1542 CostKind, Mask.size() / 2, BaseTp);
1543
1544 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1545 if (Kind == TTI::SK_Transpose)
1546 Kind = TTI::SK_PermuteTwoSrc;
1547
1548 if (Kind == TTI::SK_Broadcast) {
1549 // For Broadcasts we are splatting the first element from the first input
1550 // register, so we only need to reference that input, and all the output
1551 // registers are the same.
1552 LT.first = 1;
1553
1554 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1555 using namespace PatternMatch;
1556 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1557 (ST->hasAVX2() ||
1558 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1559 return TTI::TCC_Free;
1560 }
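// Illustrative IR for the free-broadcast case above (assuming an AVX2
// target):
//
//   %s = load float, ptr %p                        ; single use
//   %i = insertelement <8 x float> poison, float %s, i64 0
//   %b = shufflevector <8 x float> %i, <8 x float> poison,
//                      <8 x i32> zeroinitializer
//
// The splat folds into a single memory-operand 'vbroadcastss', so the
// shuffle itself is costed as TCC_Free.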
1561
1562 // Treat <X x bfloat> shuffles as <X x half>.
1563 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1564 LT.second = LT.second.changeVectorElementType(MVT::f16);
1565
1566 // Subvector extractions are free if they start at the beginning of a
1567 // vector and cheap if the subvectors are aligned.
1568 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1569 int NumElts = LT.second.getVectorNumElements();
1570 if ((Index % NumElts) == 0)
1571 return 0;
1572 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1573 if (SubLT.second.isVector()) {
1574 int NumSubElts = SubLT.second.getVectorNumElements();
1575 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1576 return SubLT.first;
1577 // Handle some cases for widening legalization. For now we only handle
1578 // cases where the original subvector was naturally aligned and evenly
1579 // fit in its legalized subvector type.
1580 // FIXME: Remove some of the alignment restrictions.
1581 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1582 // vectors.
1583 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1584 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1585 (NumSubElts % OrigSubElts) == 0 &&
1586 LT.second.getVectorElementType() ==
1587 SubLT.second.getVectorElementType() &&
1588 LT.second.getVectorElementType().getSizeInBits() ==
1589 BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1590 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1591 "Unexpected number of elements!");
1592 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1593 LT.second.getVectorNumElements());
1594 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1595 SubLT.second.getVectorNumElements());
1596 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1597 InstructionCost ExtractCost =
1598 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1599 CostKind, ExtractIndex, SubTy);
1600
1601 // If the original size is 32 bits or more, we can use pshufd. Otherwise
1602 // if we have SSSE3 we can use pshufb.
1603 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1604 return ExtractCost + 1; // pshufd or pshufb
1605
1606 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1607 "Unexpected vector size");
1608
1609 return ExtractCost + 2; // worst case pshufhw + pshufd
1610 }
1611 }
1612 // If the extract subvector is not optimal, treat it as single op shuffle.
1613 Kind = TTI::SK_PermuteSingleSrc;
1614 }
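// Illustrative examples for the extraction rules above: extracting the low
// <4 x float> half of a legal <8 x float> (Index == 0) is modelled as free,
// while extracting the upper half (Index == 4) is subvector-aligned and
// costs SubLT.first (a single vextractf128 on AVX targets).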
1615
1616 // Subvector insertions are cheap if the subvectors are aligned.
1617 // Note that in general, the insertion starting at the beginning of a vector
1618 // isn't free, because we need to preserve the rest of the wide vector.
1619 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1620 int NumElts = LT.second.getVectorNumElements();
1621 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1622 if (SubLT.second.isVector()) {
1623 int NumSubElts = SubLT.second.getVectorNumElements();
1624 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1625 return SubLT.first;
1626 }
1627
1628 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1629 Kind = TTI::SK_PermuteTwoSrc;
1630 }
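// Illustrative examples for the insertion rules above: inserting a
// <4 x float> into an <8 x float> at Index == 4 is subvector-aligned and
// costs SubLT.first (vinsertf128 on AVX targets), whereas an insertion at
// Index == 2 is misaligned and falls through to the two-source permute
// costing below.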
1631
1632 // Handle some common (illegal) sub-vector types as they are often very cheap
1633 // to shuffle even on targets without PSHUFB.
1634 EVT VT = TLI->getValueType(DL, BaseTp);
1635 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1636 !ST->hasSSSE3()) {
1637 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1638 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1639 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1640 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1641 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1642 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1643
1644 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1645 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1646 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1647 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1648
1649 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1650 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1651 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1652 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1653
1654 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1655 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1656 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1657 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1658 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1659
1660 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1661 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1662 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1663 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1664 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1665 };
1666
1667 if (ST->hasSSE2())
1668 if (const auto *Entry =
1669 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1670 return Entry->Cost;
1671 }
1672
1673 // We are going to permute multiple sources and the result will be in multiple
1674 // destinations. We provide an accurate cost only for splits where the element
1675 // type remains the same.
1676 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1677 MVT LegalVT = LT.second;
1678 if (LegalVT.isVector() &&
1679 LegalVT.getVectorElementType().getSizeInBits() ==
1680 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1681 LegalVT.getVectorNumElements() <
1682 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1683 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1684 unsigned LegalVTSize = LegalVT.getStoreSize();
1685 // Number of source vectors after legalization:
1686 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1687 // Number of destination vectors after legalization:
1688 InstructionCost NumOfDests = LT.first;
1689
1690 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1691 LegalVT.getVectorNumElements());
1692
1693 if (!Mask.empty() && NumOfDests.isValid()) {
1694 // Try to perform a better estimation of the permutation.
1695 // 1. Split the source/destination vectors into real registers.
1696 // 2. Do the mask analysis to identify which real registers are
1697 // permuted. If more than one source register is used to build a
1698 // destination register, the cost for this destination register
1699 // is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only
1700 // one source register is used, build the mask and calculate the cost
1701 // as a cost of PermuteSingleSrc.
1702 // Also, for the single register permute we try to identify if the
1703 // destination register is just a copy of the source register or a
1704 // copy of the previous destination register (the cost is
1705 // TTI::TCC_Basic). If the source register is just reused, the cost of
1706 // this operation is 0.
1707 NumOfDests =
1708 getTypeLegalizationCost(
1709 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1710 .first;
1711 unsigned E = *NumOfDests.getValue();
1712 unsigned NormalizedVF =
1713 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1714 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1715 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1716 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1717 copy(Mask, NormalizedMask.begin());
1718 unsigned PrevSrcReg = 0;
1719 ArrayRef<int> PrevRegMask;
1720 InstructionCost Cost = 0;
1721 processShuffleMasks(
1722 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1723 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1724 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1725 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1726 // Check if the previous register can be just copied to the next
1727 // one.
1728 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1729 PrevRegMask != RegMask)
1730 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1731 RegMask, CostKind, 0, nullptr);
1732 else
1733 // Just a copy of previous destination register.
1734 Cost += TTI::TCC_Basic;
1735 return;
1736 }
1737 if (SrcReg != DestReg &&
1738 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1739 // Just a copy of the source register.
1740 Cost += TTI::TCC_Basic;
1741 }
1742 PrevSrcReg = SrcReg;
1743 PrevRegMask = RegMask;
1744 },
1745 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1746 unsigned /*Unused*/,
1747 unsigned /*Unused*/) {
1748 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1749 CostKind, 0, nullptr);
1750 });
1751 return Cost;
1752 }
1753
1754 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1755 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1756 std::nullopt, CostKind, 0, nullptr);
1757 }
1758
1759 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1760 }
1761
1762 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1763 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1764 // We assume that source and destination have the same vector type.
1765 InstructionCost NumOfDests = LT.first;
1766 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1767 LT.first = NumOfDests * NumOfShufflesPerDest;
1768 }
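// A worked example of the split accounting above (illustrative): a
// two-source permute of <8 x i64> on an AVX2 target gives LT = (2, v4i64),
// so NumOfDests = 2 and NumOfShufflesPerDest = 2 * 2 - 1 = 3, i.e.
// LT.first = 6; combined with the AVX2 v4i64 SK_PermuteTwoSrc entry of 3
// below, the total cost is 6 * 3 = 18.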
1769
1770 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1771 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1772 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1773
1774 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1775 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1776
1777 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1778 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1779 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1780 };
1781
1782 if (ST->hasVBMI())
1783 if (const auto *Entry =
1784 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1785 return LT.first * Entry->Cost;
1786
1787 static const CostTblEntry AVX512BWShuffleTbl[] = {
1788 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1789 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1790 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1791
1792 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1793 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1794 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1795 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1796
1797 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1798 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1799 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1800 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1801 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1802
1803 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1804 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1805 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1806 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1807 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1808
1809 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1810 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1811
1812 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1813 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1814 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1815 };
1816
1817 if (ST->hasBWI())
1818 if (const auto *Entry =
1819 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1820 return LT.first * Entry->Cost;
1821
1822 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1823 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1824 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1825 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1826 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1827 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1828 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1829 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1830
1831 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1832 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1833 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1834 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1835 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1836 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1837 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1838
1839 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1840 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1841 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1842 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1843 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1844 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1845 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1846 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1847 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1848 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1849 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1850
1851 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1852 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1853 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1854 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1855 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1856 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1857 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1858 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1859 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1860 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1861 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1862 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1863 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1864
1865 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1866 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1867 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1868 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1869 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1870 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1871 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1872 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1873 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1874 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1875 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1876 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1877
1878 // FIXME: This just applies the type legalization cost rules above
1879 // assuming these completely split.
1880 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1881 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1882 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1883 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1884 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1885 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1886
1887 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1888 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1889 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1890 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1891 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1892 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1893 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1894 };
1895
1896 if (ST->hasAVX512())
1897 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1898 if (auto KindCost = Entry->Cost[CostKind])
1899 return LT.first * *KindCost;
1900
1901 static const CostTblEntry AVX2ShuffleTbl[] = {
1902 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1903 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1904 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1905 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1906 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1907 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1908 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1909
1910 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1911 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1912 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1913 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1914 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1915 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1916 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1917
1918 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1919 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1920 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1921
1922 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1923 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1924 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1925 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1926 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1927
1928 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1929 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1930 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1931 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1932 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1933 // + vpblendvb
1934 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1935 // + vpblendvb
1936 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1937 // + vpblendvb
1938
1939 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1940 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1941 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1942 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1943 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1944 // + vpblendvb
1945 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1946 // + vpblendvb
1947 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1948 // + vpblendvb
1949 };
1950
1951 if (ST->hasAVX2())
1952 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1953 return LT.first * Entry->Cost;
1954
1955 static const CostTblEntry XOPShuffleTbl[] = {
1956 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1957 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1958 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1959 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1960 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1961 // + vinsertf128
1962 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1963 // + vinsertf128
1964
1965 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1966 // + vinsertf128
1967 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1968 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1969 // + vinsertf128
1970 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1971 };
1972
1973 if (ST->hasXOP())
1974 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1975 return LT.first * Entry->Cost;
1976
1977 static const CostTblEntry AVX1ShuffleTbl[] = {
1978 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1979 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1980 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1981 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1982 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1983 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1984 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1985
1986 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1987 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1988 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1989 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1990 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1991 // + vinsertf128
1992 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1993 // + vinsertf128
1994 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1995 // + vinsertf128
1996
1997 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1998 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1999 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
2000 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
2001 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
2002 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
2003 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
2004
2005 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
2006 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
2007 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2008 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2009 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2010 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2011 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2012
2013 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
2014 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
2015 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2016 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2017 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
2018 // + 2*por + vinsertf128
2019 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
2020 // + 2*por + vinsertf128
2021 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
2022 // + 2*por + vinsertf128
2023
2024 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
2025 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
2026 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2027 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2028 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
2029 // + 4*por + vinsertf128
2030 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
2031 // + 4*por + vinsertf128
2032 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
2033 // + 4*por + vinsertf128
2034 };
2035
2036 if (ST->hasAVX())
2037 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2038 return LT.first * Entry->Cost;
2039
2040 static const CostTblEntry SSE41ShuffleTbl[] = {
2041 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2042 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2043 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2044 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2045 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2046 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2047 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2048 };
2049
2050 if (ST->hasSSE41())
2051 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2052 return LT.first * Entry->Cost;
2053
2054 static const CostTblEntry SSSE3ShuffleTbl[] = {
2055 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2056 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2057 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2058
2059 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2060 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2061 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2062
2063 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2064 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2065 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2066
2067 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2068 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2069 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2070 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2071 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2072
2073 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2074 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2075 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2076
2077 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2078 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2079 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2080 };
2081
2082 if (ST->hasSSSE3())
2083 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2084 return LT.first * Entry->Cost;
2085
2086 static const CostTblEntry SSE2ShuffleTbl[] = {
2087 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2088 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2089 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2090 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2091 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2092 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2093
2094 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2095 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2096 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2097 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2098 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2099 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2100 // + 2*pshufd + 2*unpck + packus
2101
2102 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2103 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2104 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2105 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2106 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2107 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2108
2109 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2110 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2111 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2112 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2113 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2114 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2115
2116 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2117 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2118 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2119 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2120 // + pshufd/unpck
2121 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2122 // + pshufd/unpck
2123 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2124 // + 2*pshufd + 2*unpck + 2*packus
2125
2126 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2127 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2128 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2129 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2130 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2131 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2132 };
2133
2134 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2135 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2136 };
2137
2138 if (ST->hasSSE2()) {
2139 bool IsLoad =
2140 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2141 if (ST->hasSSE3() && IsLoad)
2142 if (const auto *Entry =
2143 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2144 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2145 LT.second.getVectorElementCount()) &&
2146 "Table entry missing from isLegalBroadcastLoad()");
2147 return LT.first * Entry->Cost;
2148 }
2149
2150 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2151 return LT.first * Entry->Cost;
2152 }
2153
2154 static const CostTblEntry SSE1ShuffleTbl[] = {
2155 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2156 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2157 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2158 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2159 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2160 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2161 };
2162
2163 if (ST->hasSSE1())
2164 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2165 return LT.first * Entry->Cost;
2166
2167 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2168}
2169
2170 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2171 Type *Src,
2172 TTI::CastContextHint CCH,
2173 TTI::TargetCostKind CostKind,
2174 const Instruction *I) {
2175 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2176 assert(ISD && "Invalid opcode");
2177
2178 // The cost tables include both specific, custom (non-legal) src/dst type
2179 // conversions and generic, legalized types. We test for customs first, before
2180 // falling back to legalization.
2181 // FIXME: Need a better design of the cost table to handle non-simple types and
2182 // the potentially massive number of combinations (elem_num x src_type x dst_type).
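// An illustrative lookup, following the order described above: a
// 'sext <32 x i8> %x to <32 x i16>' on an AVX512BW target matches the first
// AVX512BW entry directly and is costed at 1, without ever consulting the
// legalization-based fallbacks further down.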
2183 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2184 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2185 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2186
2187 // Mask sign extend has an instruction.
2188 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2189 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2190 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2191 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2192 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2193 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2194 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2195 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2196 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2197 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2198 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2199 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2200 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2201 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2202 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2203 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2204 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2205
2206 // Mask zero extend is a sext + shift.
2207 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2208 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2209 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2210 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2211 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2212 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2213 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2214 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2215 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2216 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2217 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2218 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2219 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2220 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2221 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2222 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2223 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2224
2225 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2226 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2227 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2228 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2229 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2230 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2231 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2232 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2233 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2234 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2235 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2236 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2237 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2238 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2239 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2240 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2241 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2242
2243 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2244 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2245 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2246 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2247 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2248 };
2249
2250 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2251 // Mask sign extend has an instruction.
2252 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2253 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2254 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2255 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2256 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2257 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2258 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2259 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2260
2261 // Mask zero extend is a sext + shift.
2262 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2263 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2264 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2265 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2266 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2267 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1 } },
2268 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } },
2269 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } },
2270
2271 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2272 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2273 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2274 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2275 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2276 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2277 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2278 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2279
2280 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2281 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2282
2283 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2284 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2285
2286 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2287 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2288
2289 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2290 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2291 };
2292
2293 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2294 // 256-bit wide vectors.
2295
2296 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2297 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2298 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2299 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2300 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2301
2302 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2303 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2304 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2305 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2306 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2307 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2308 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2309 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2310 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2311 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2312 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2313 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2314 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2315 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2316 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2317 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2318 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2319 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2320 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2321 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2322 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2323 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2324 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2325 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2326 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2327 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2328 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2329 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2330 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2331 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2332 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2333 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2334 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2335 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2336
2337 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2338 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2339 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2340
2341 // Sign extend is zmm vpternlogd+vptruncdb.
2342 // Zero extend is zmm broadcast load+vptruncdw.
2343 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2344 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2345 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2346 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2347 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2348 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2349 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2350 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2351
2352 // Sign extend is zmm vpternlogd+vptruncdw.
2353 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2354 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2355 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2356 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2357 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2358 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2359 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2360 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2361 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2362
2363 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2364 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2365 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2366 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2367 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2368 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2369 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2370 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2371 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2372 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2373
2374 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2375 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2376 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2377 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2378
2379 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2380 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2381 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2382 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2383 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2384 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2385 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2386 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2387 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2388 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2389
2390 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2391 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2392
2393 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2394 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2395 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2396 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2397 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2398 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2399 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2400 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2401
2402 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2403 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2404 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2405 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2406 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2407 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2408 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2409 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2410 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2411 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2412
2413 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2414 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2415 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2416 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2417 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2418 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2419 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2420 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2421 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2422 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2423 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2424
2425 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2426 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2427 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2428 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2429 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2430 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2431 };
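// Each entry in these conversion tables maps (ISD opcode, dst MVT, src MVT) to
// the four CostKindCosts values in declaration order: { RecipThroughput,
// Latency, CodeSize, SizeAndLatency }. For example, { ISD::ZERO_EXTEND,
// MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } } above reads as a reciprocal
// throughput of 2 for the vpternlogd+psrld pair and 1 for the other kinds.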
2432
2433 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] {
2434 // Mask sign extend has an instruction.
2435 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2436 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2437 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2438 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2439 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2440 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2441 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2442 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2443 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2444 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2445 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2446 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2447 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2448 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2449 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2450 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2451 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2452
2453 // Mask zero extend is a sext + shift.
2454 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2455 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2456 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2457 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2458 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2459 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2460 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2461 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2462 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2463 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2464 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2465 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2466 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2467 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2468 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2469 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2470 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2471
2472 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2473 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2474 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2475 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2476 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2477 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2478 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2479 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2480 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2481 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2482 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2483 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2484 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2485 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2486 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2487 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2488 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2489
2490 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2491 };
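// The BW+VL costs above reflect that a vXi1 mask sign extend maps directly
// onto vpmovm2b/vpmovm2w (hence 1), while the matching zero extend is that
// sext plus a logical shift right (hence 2).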
2492
2493 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2494 // Mask sign extend has an instruction.
2495 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2496 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2497 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2498 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2499 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2500 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2501 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2502 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2503
2504 // Mask zero extend is a sext + shift.
2505 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2506 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2507 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2508 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2509 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2510 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2511 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2512 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2513
2514 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2515 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2516 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2517 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2518 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2519 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2520 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2521 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2522
2523 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2524 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2525 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2526 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2527
2528 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2529 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2530 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2531 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2532
2533 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2534 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2535 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2536 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2537
2538 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2539 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2540 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2541 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2542 };
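// AVX512DQ is what makes these single-instruction: vpmovm2d/vpmovm2q cover
// the mask extends, and vcvtqq2ps/vcvtqq2pd (with their truncating and
// unsigned counterparts) handle the i64 <-> f32/f64 conversions natively.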
2543
2544 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2545 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2546 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2547 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2548 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2549 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2550 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2551 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2552 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2553 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2554 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2555 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2556 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2557 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2558 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2559 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2560 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2561 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2562 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2563
2564 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2565 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2566 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2567 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2568 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2569 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2570 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2571 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2572 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2573 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2574
2575 // sign extend is vpcmpeq+maskedmove+vpmovdw
2576 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2577 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2578 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2579 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2580 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2581 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2582 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2584 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2585
2586 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2587 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2588 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2589 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2590 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2591 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2592 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2593 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2594
2595 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2596 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2597 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2598 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2599
2600 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2601 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2602 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2604 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2606 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2608 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2610 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2611 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2612
2613 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2614 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2615 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2616 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2617
2618 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2619 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2620 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2621 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2622 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2623 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2624 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2625 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2626 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2627 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2628 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2629 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2630 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2631
2632 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2633 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2634 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2635
2636 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2637 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2638 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2639 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2640 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2641 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2642 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2643 };
2644
2645 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2646 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2647 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2648 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2649 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2650 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2651 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2652
2653 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2655 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2657 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2659 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2660 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2661 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2662 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2663 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2664 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2665 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2666 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2667
2668 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2669
2670 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2671 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2672 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2673 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2674 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2675 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2676 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2677 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2678 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2679 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2680 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2681 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2682
2683 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2684 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2685
2686 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2687 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2688 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2689 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2690
2691 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2692 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2693 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2694 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2695 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2696 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2697 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2698 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2699
2700 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2701 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2702 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2703 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2704 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2705 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2706 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2707
2708 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2709 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2710 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2711 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2712 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2713 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2714 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2715 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2716 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2717 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2718 };
2719
2720 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2721 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2722 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2723 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2724 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2725 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2726 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2727
2728 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2729 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2730 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2731 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2732 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2733 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2734 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2735 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2736 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2737 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2738 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2739 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2740
2741 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2742 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2743 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2744 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2745 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2746
2747 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2748 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2749 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2750 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2751 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2752 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2753 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2754 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2755
2756 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2757 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2758 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2759 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2760 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2761 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2762 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2763 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2764 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2765 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2766 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2767 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2768
2769 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2770 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2771 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2772 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2773 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2774 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2775 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2776 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2777 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2778 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2779 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2780 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2781 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2782 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2783 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2784 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2785 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2786
2787 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2788 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2789 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2790 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2791 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2792 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2793 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2794 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2795 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2796 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2797 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2798
2799 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2800 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2801 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2802 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2803 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2804 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2805 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2806 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2807 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2808 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2809 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2810 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2811 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2812
2813 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2814 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2815 };
2816
2817 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2818 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2819 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2820 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2821 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2822 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2823 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2824 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2825 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2826 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2827 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2828 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2829 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2830
2831 // These truncates end up widening elements.
2832 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
2833 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
2834 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
2835
2836 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2837 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2838 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2839
2840 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2841 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2845 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2846 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2847 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2848 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2849 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2850 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2851
2852 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2853 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2854 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2858 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2859 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2860 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2861 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2862 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2863 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2864 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2865 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2866
2867 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2868 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2869 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2870 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2871 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2872 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2873 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2874 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2875 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2876 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2877
2878 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2879 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2880 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2881 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2882 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2883 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2884 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2885 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2886 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2887 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2888 };
2889
2890 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2891 // These are somewhat magic numbers justified by comparing the
2892 // output of llvm-mca for our various supported scheduler models
2893 // and basing it off the worst case scenario.
2894 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2895 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2896 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2897 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2898 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2899 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2902 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2903 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
2904 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
2905 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
2906
2907 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2908 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2909 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
2913 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
2914 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2915 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
2916 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
2917 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2918 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
2919 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
2920
2921 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2922 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2923 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2924 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2925 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2926 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2927 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2928 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2929 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2930 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2931
2932 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2933 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2934 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2935 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
2936 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2937 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2938 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2939 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2940 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
2941 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
2942
2943 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2944 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2945 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
2946 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
2947 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2948 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
2949 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
2950 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
2951 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2952 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
2953 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2954 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
2955
2956 // These truncates are really widening elements.
2957 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
2958 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
2959 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
2960 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
2961 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
2962 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
2963
2964 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
2965 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
2966 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
2967 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
2968 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
2969 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
2970 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2971 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
2972 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
2973 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
2974 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
2975 };
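// The tables above are consulted from the most specific feature level down to
// the most generic (the 512-bit BW/DQ/F tables first, then the VL variants,
// AVX2, AVX, SSE4.1 and finally SSE2), so the first hit reflects the cheapest
// lowering the subtarget can actually use.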
2976
2977 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2978 EVT SrcTy = TLI->getValueType(DL, Src);
2979 EVT DstTy = TLI->getValueType(DL, Dst);
2980
2981 // The function getSimpleVT only handles simple value types.
2982 if (SrcTy.isSimple() && DstTy.isSimple()) {
2983 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2984 MVT SimpleDstTy = DstTy.getSimpleVT();
2985
2986 if (ST->useAVX512Regs()) {
2987 if (ST->hasBWI())
2988 if (const auto *Entry = ConvertCostTableLookup(
2989 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2990 if (auto KindCost = Entry->Cost[CostKind])
2991 return *KindCost;
2992
2993 if (ST->hasDQI())
2994 if (const auto *Entry = ConvertCostTableLookup(
2995 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2996 if (auto KindCost = Entry->Cost[CostKind])
2997 return *KindCost;
2998
2999 if (ST->hasAVX512())
3000 if (const auto *Entry = ConvertCostTableLookup(
3001 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3002 if (auto KindCost = Entry->Cost[CostKind])
3003 return *KindCost;
3004 }
3005
3006 if (ST->hasBWI())
3007 if (const auto *Entry = ConvertCostTableLookup(
3008 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3009 if (auto KindCost = Entry->Cost[CostKind])
3010 return *KindCost;
3011
3012 if (ST->hasDQI())
3013 if (const auto *Entry = ConvertCostTableLookup(
3014 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3015 if (auto KindCost = Entry->Cost[CostKind])
3016 return *KindCost;
3017
3018 if (ST->hasAVX512())
3019 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3020 SimpleDstTy, SimpleSrcTy))
3021 if (auto KindCost = Entry->Cost[CostKind])
3022 return *KindCost;
3023
3024 if (ST->hasAVX2()) {
3025 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3026 SimpleDstTy, SimpleSrcTy))
3027 if (auto KindCost = Entry->Cost[CostKind])
3028 return *KindCost;
3029 }
3030
3031 if (ST->hasAVX()) {
3032 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3033 SimpleDstTy, SimpleSrcTy))
3034 if (auto KindCost = Entry->Cost[CostKind])
3035 return *KindCost;
3036 }
3037
3038 if (ST->hasSSE41()) {
3039 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3040 SimpleDstTy, SimpleSrcTy))
3041 if (auto KindCost = Entry->Cost[CostKind])
3042 return *KindCost;
3043 }
3044
3045 if (ST->hasSSE2()) {
3046 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3047 SimpleDstTy, SimpleSrcTy))
3048 if (auto KindCost = Entry->Cost[CostKind])
3049 return *KindCost;
3050 }
3051 }
3052
3053 // Fall back to legalized types.
3054 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3055 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3056
3057 // If we're truncating to the same legalized type - just assume it's free.
3058 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3059 return TTI::TCC_Free;
3060
3061 if (ST->useAVX512Regs()) {
3062 if (ST->hasBWI())
3063 if (const auto *Entry = ConvertCostTableLookup(
3064 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3065 if (auto KindCost = Entry->Cost[CostKind])
3066 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3067
3068 if (ST->hasDQI())
3069 if (const auto *Entry = ConvertCostTableLookup(
3070 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3071 if (auto KindCost = Entry->Cost[CostKind])
3072 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3073
3074 if (ST->hasAVX512())
3075 if (const auto *Entry = ConvertCostTableLookup(
3076 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3077 if (auto KindCost = Entry->Cost[CostKind])
3078 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3079 }
3080
3081 if (ST->hasBWI())
3082 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3083 LTDest.second, LTSrc.second))
3084 if (auto KindCost = Entry->Cost[CostKind])
3085 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3086
3087 if (ST->hasDQI())
3088 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3089 LTDest.second, LTSrc.second))
3090 if (auto KindCost = Entry->Cost[CostKind])
3091 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3092
3093 if (ST->hasAVX512())
3094 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3095 LTDest.second, LTSrc.second))
3096 if (auto KindCost = Entry->Cost[CostKind])
3097 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3098
3099 if (ST->hasAVX2())
3100 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3101 LTDest.second, LTSrc.second))
3102 if (auto KindCost = Entry->Cost[CostKind])
3103 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3104
3105 if (ST->hasAVX())
3106 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3107 LTDest.second, LTSrc.second))
3108 if (auto KindCost = Entry->Cost[CostKind])
3109 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3110
3111 if (ST->hasSSE41())
3112 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3113 LTDest.second, LTSrc.second))
3114 if (auto KindCost = Entry->Cost[CostKind])
3115 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3116
3117 if (ST->hasSSE2())
3118 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3119 LTDest.second, LTSrc.second))
3120 if (auto KindCost = Entry->Cost[CostKind])
3121 return std::max(LTSrc.first, LTDest.first) * *KindCost;
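// The legalized-type lookups above scale the per-piece cost by the larger of
// the two split factors. As a rough example, a v32i32 -> v32i8 truncate on a
// bare SSE2 target splits the source into eight v4i32 pieces, so a matching
// v16i8 <- v4i32 entry would be multiplied by that factor of eight.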
3122
3123 // Fallback: for i8/i16 sitofp/uitofp cases we extend the source to i32 and
3124 // then cost the conversion as an i32 sitofp.
3125 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3126 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3127 Type *ExtSrc = Src->getWithNewBitWidth(32);
3128 unsigned ExtOpc =
3129 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3130
3131 // For scalar loads the extend would be free.
3132 InstructionCost ExtCost = 0;
3133 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3134 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3135
3136 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3137 TTI::CastContextHint::None, CostKind);
3138 }
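// Illustrative IR for the fallback above:
//   %r = uitofp <8 x i8> %x to <8 x float>
// is costed as a zext to <8 x i32> plus an <8 x i32> sitofp, with the extend
// treated as free when a scalar source comes straight from a load.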
3139
3140 // Fallback: for fptosi/fptoui i8/i16 cases we convert via an i32 fptosi and
3141 // then truncate the result.
3142 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3143 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3144 Type *TruncDst = Dst->getWithNewBitWidth(32);
3145 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3146 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3147 TTI::CastContextHint::None, CostKind);
3148 }
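// Likewise, e.g. fptoui <8 x float> -> <8 x i8> is costed as an i32-element
// fptosi plus a truncate of the <8 x i32> result down to <8 x i8>.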
3149
3150 // TODO: Allow non-throughput costs that aren't binary.
3151 auto AdjustCost = [&CostKind](InstructionCost Cost,
3152 InstructionCost N = 1) -> InstructionCost {
3153 if (CostKind != TTI::TCK_RecipThroughput)
3154 return Cost == 0 ? 0 : N;
3155 return Cost * N;
3156 };
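// So AdjustCost(3) stays 3 for TCK_RecipThroughput but clamps to N (default 1)
// for the other kinds, i.e. non-throughput queries see a flat per-value cost
// rather than the base implementation's throughput-derived number.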
3157 return AdjustCost(
3158 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3159}
3160
3161 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3162 Type *CondTy,
3163 CmpInst::Predicate VecPred,
3164 TTI::TargetCostKind CostKind,
3165 const Instruction *I) {
3166 // Early out if this type isn't scalar/vector integer/float.
3167 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3168 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3169 I);
3170
3171 // Legalize the type.
3172 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3173
3174 MVT MTy = LT.second;
3175
3176 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3177 assert(ISD && "Invalid opcode");
3178
3179 InstructionCost ExtraCost = 0;
3180 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3181 // Some vector comparison predicates cost extra instructions.
3182 // TODO: Adjust ExtraCost based on CostKind?
3183 // TODO: Should we invert this and assume worst case cmp costs
3184 // and reduce for particular predicates?
3185 if (MTy.isVector() &&
3186 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3187 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3188 ST->hasBWI())) {
3189 // Fallback to I if a specific predicate wasn't specified.
3190 CmpInst::Predicate Pred = VecPred;
3191 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3192 Pred == CmpInst::BAD_FCMP_PREDICATE))
3193 Pred = cast<CmpInst>(I)->getPredicate();
3194
3195 bool CmpWithConstant = false;
3196 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3197 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3198
3199 switch (Pred) {
3200 case CmpInst::Predicate::ICMP_NE:
3201 // xor(cmpeq(x,y),-1)
3202 ExtraCost = CmpWithConstant ? 0 : 1;
3203 break;
3204 case CmpInst::Predicate::ICMP_SGE:
3205 case CmpInst::Predicate::ICMP_SLE:
3206 // xor(cmpgt(x,y),-1)
3207 ExtraCost = CmpWithConstant ? 0 : 1;
3208 break;
3209 case CmpInst::Predicate::ICMP_ULT:
3210 case CmpInst::Predicate::ICMP_UGT:
3211 // cmpgt(xor(x,signbit),xor(y,signbit))
3212 // xor(cmpeq(pmaxu(x,y),x),-1)
3213 ExtraCost = CmpWithConstant ? 1 : 2;
3214 break;
3215 case CmpInst::Predicate::ICMP_ULE:
3216 case CmpInst::Predicate::ICMP_UGE:
3217 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3218 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3219 // cmpeq(psubus(x,y),0)
3220 // cmpeq(pminu(x,y),x)
3221 ExtraCost = 1;
3222 } else {
3223 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3224 ExtraCost = CmpWithConstant ? 2 : 3;
3225 }
3226 break;
3227 case CmpInst::Predicate::FCMP_ONE:
3228 case CmpInst::Predicate::FCMP_UEQ:
3229 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3230 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3231 if (CondTy && !ST->hasAVX())
3232 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3233 CmpInst::Predicate::FCMP_UNO, CostKind) +
3234 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3235 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3236 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3237
3238 break;
3239 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3240 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3241 // Assume worst case scenario and add the maximum extra cost.
3242 ExtraCost = 3;
3243 break;
3244 default:
3245 break;
3246 }
3247 }
3248 }
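// Example: an ICMP_ULT between two non-constant v16i8 operands on a plain
// SSE2 target uses the cmpgt(xor(x,signbit),xor(y,signbit)) form, so
// ExtraCost is 2 on top of the base pcmpgtb cost looked up below.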
3249
3250 static const CostKindTblEntry SLMCostTbl[] = {
3251 // slm pcmpeq/pcmpgt throughput is 2
3252 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3253 // slm pblendvb/blendvpd/blendvps throughput is 4
3254 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3255 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3256 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3257 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3258 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3259 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3260 };
3261
3262 static const CostKindTblEntry AVX512BWCostTbl[] = {
3263 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3264 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3265 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3266 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3267
3268 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3269 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3270 };
3271
3272 static const CostKindTblEntry AVX512CostTbl[] = {
3273 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3274 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3275 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3276 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3277
3278 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3279 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3280 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3281 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3282 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3283 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3284 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3285
3286 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3287 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3288 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3289 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3290 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3291 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3292 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3293 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3294 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3295 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3296 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3297 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3298 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3299 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3300
3301 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3302 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3303 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3304 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3305 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3306 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3307 };
3308
3309 static const CostKindTblEntry AVX2CostTbl[] = {
3310 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3311 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3312 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3313 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3314 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3315 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3316
3317 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3318 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3319 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3320 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3321
3322 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3323 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3324 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3325 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3326 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3327 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3328 };
3329
3330 static const CostKindTblEntry XOPCostTbl[] = {
3331 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3332 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3333 };
3334
3335 static const CostKindTblEntry AVX1CostTbl[] = {
3336 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3337 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3338 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3339 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3340 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3341 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3342
3343 // AVX1 does not support 8-wide integer compare.
3344 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3345 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3346 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3347 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3348
3349 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3350 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3351 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3352 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3353 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3354 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3355 };
3356
3357 static const CostKindTblEntry SSE42CostTbl[] = {
3358 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3359 };
3360
3361 static const CostKindTblEntry SSE41CostTbl[] = {
3362 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3363 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3364
3365 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3366 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3367 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3368 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3369 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3370 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3371 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3372 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3373 };
3374
3375 static const CostKindTblEntry SSE2CostTbl[] = {
3376 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3377 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3378
3379 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3380 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3381 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3382 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3383
3384 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3385 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3386 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3387 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3388 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3389 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3390 };
3391
3392 static const CostKindTblEntry SSE1CostTbl[] = {
3393 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3394 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3395
3396 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3397 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3398 };
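// Each lookup below returns LT.first * (ExtraCost + KindCost): the number of
// legalized vector pieces times the per-piece compare/select cost plus any
// predicate expansion counted above. The SSE2 v16i8 ICMP_ULT example would
// thus cost 1 * (2 + 1) = 3 in reciprocal throughput.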
3399
3400 if (ST->useSLMArithCosts())
3401 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3402 if (auto KindCost = Entry->Cost[CostKind])
3403 return LT.first * (ExtraCost + *KindCost);
3404
3405 if (ST->hasBWI())
3406 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3407 if (auto KindCost = Entry->Cost[CostKind])
3408 return LT.first * (ExtraCost + *KindCost);
3409
3410 if (ST->hasAVX512())
3411 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3412 if (auto KindCost = Entry->Cost[CostKind])
3413 return LT.first * (ExtraCost + *KindCost);
3414
3415 if (ST->hasAVX2())
3416 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3417 if (auto KindCost = Entry->Cost[CostKind])
3418 return LT.first * (ExtraCost + *KindCost);
3419
3420 if (ST->hasXOP())
3421 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3422 if (auto KindCost = Entry->Cost[CostKind])
3423 return LT.first * (ExtraCost + *KindCost);
3424
3425 if (ST->hasAVX())
3426 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3427 if (auto KindCost = Entry->Cost[CostKind])
3428 return LT.first * (ExtraCost + *KindCost);
3429
3430 if (ST->hasSSE42())
3431 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3432 if (auto KindCost = Entry->Cost[CostKind])
3433 return LT.first * (ExtraCost + *KindCost);
3434
3435 if (ST->hasSSE41())
3436 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3437 if (auto KindCost = Entry->Cost[CostKind])
3438 return LT.first * (ExtraCost + *KindCost);
3439
3440 if (ST->hasSSE2())
3441 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3442 if (auto KindCost = Entry->Cost[CostKind])
3443 return LT.first * (ExtraCost + *KindCost);
3444
3445 if (ST->hasSSE1())
3446 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3447 if (auto KindCost = Entry->Cost[CostKind])
3448 return LT.first * (ExtraCost + *KindCost);
3449
3450 // Assume a 3cy latency for fp select ops.
3451 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3452 if (ValTy->getScalarType()->isFloatingPointTy())
3453 return 3;
3454
3455 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3456}
3457
3458 InstructionCost
3459 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3460 TTI::TargetCostKind CostKind) {
3463 // Costs should match the codegen from:
3464 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3465 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3466 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3467 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3468 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3469
3470 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3471 // specialized in these tables yet.
3472 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3473 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3474 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3475 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3476 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3477 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3478 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3479 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3480 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3481 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3482 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3483 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3484 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3485 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3486 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3487 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3488 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3489 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3490 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3491 };
3492 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3493 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3494 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3495 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3496 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3497 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3498 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3499 };
3500 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3501 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3502 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3503 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3504 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3505 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3506 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3507 };
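// AVX512BITALG supplies vpopcntb/vpopcntw and AVX512VPOPCNTDQ supplies
// vpopcntd/vpopcntq, which is why the two tables above can price vector
// CTPOP at a single instruction for every element width.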
3508 static const CostKindTblEntry AVX512CDCostTbl[] = {
3509 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3510 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3511 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3512 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3513 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3514 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3515 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3516 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3517 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3518 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3519 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3520 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3521
3522 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3523 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3524 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3525 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3526 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3527 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3528 };
3529 static const CostKindTblEntry AVX512BWCostTbl[] = {
3530 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3531 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3532 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3533 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3534 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3535 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3536 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3537 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3538 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3539 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3540 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3541 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3542 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3543 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3544 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3545 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3546 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3547 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3548 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3549 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3550 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3551 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3552 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3553 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3554 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3555 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3556 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3557 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3558 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3559 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3560 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3561 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3562 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3563 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3564 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3565 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3566 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3567 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3568 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3569 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3570 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3571 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3572 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3573 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3574 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3575 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3576 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3577 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3578 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3579 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3580 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3581 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3582 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3583 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3584 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3585 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3586 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3587 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3588 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3589 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3590 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3591 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3592 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3593 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3594 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3595 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3596 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3597 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3598 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3599 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3600 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3601 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3602 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3603 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3604 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3605 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3606 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3607 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3608 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3609 };
3610 static const CostKindTblEntry AVX512CostTbl[] = {
3611 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3612 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3613 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3614 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3615 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3616 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3617 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3618 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3619 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3620 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3621 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3622 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3623 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3624 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3625 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3626 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3627 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3628 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3629 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3630 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3631 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3632 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3633 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3634 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3635 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3636 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3637 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3638 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3639 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3640 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3641 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3642 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3643 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3644 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3645 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3646 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3647 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3648 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3649 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3650 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3651 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3652 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3653 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3654 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3655 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3656 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3657 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3658 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3659 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3660 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3661 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3662 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3663 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3664 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3665 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3666 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3667 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3668 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3669 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3670 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3671 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3672 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3673 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3674 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3675 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3676 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3677 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3678 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3679 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3680 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3681 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3682 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3683 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3684 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3685 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3686 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3687 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3688 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3689 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3690 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3691 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3692 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3693 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3694 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3695 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3696 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3697 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3698 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3699 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3700 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3701 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3702 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3703 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3704 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3705 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3706 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3707 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3708 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3709 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3710 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3711 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3712 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3713 };
3714 static const CostKindTblEntry XOPCostTbl[] = {
3715 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3716 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3717 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3718 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3719 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3720 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3721 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3722 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3723 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3724 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3725 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3726 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3727 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3728 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3729 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3730 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3731 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3732 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3733 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3734 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3735 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3736 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3737 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3738 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3739 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3740 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3741 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3742 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3743 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3744 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3745 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3746 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3747 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3748 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3749 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3750 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3751 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3752 };
3753 static const CostKindTblEntry AVX2CostTbl[] = {
3754 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3755 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3756 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3757 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3758 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3759 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3760 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3761 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3762 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3763 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3764 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3765 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3766 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3767 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3768 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3769 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3770 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3771 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3772 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3773 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3774 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3775 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3776 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3777 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3778 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3779 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3780 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3781 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3782 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3783 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3784 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3785 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3786 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3787 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3788 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3789 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3790 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3791 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3792 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3793 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3794 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3795 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3796 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3797 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3798 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3799 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3800 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3801 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3802 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3803 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3804 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3805 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3806 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3807 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3808 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3809 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3810 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3811 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3812 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3813 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3814 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3815 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3816 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3817 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3818 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3819 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3820 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3821 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3822 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3823 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3824 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3825 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3826 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3827 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3828 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3829 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3830 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3831 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3832 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3833 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3834 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3835 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3836 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3837 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3838 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3839 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3840 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3841 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3842 };
3843 static const CostKindTblEntry AVX1CostTbl[] = {
3844 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3845 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3846 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3847 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3848 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3849 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3850 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3851 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3852 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3853 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3854 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3855 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3856 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3857 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3858 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3859 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3860 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3861 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3862 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3863 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3864 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3865 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3866 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3867 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3868 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3869 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3870 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3871 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3872 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3873 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3874 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3875 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3876 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3877 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3878 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3879 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3880 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3881 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3882 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3883 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3884 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3885 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3886 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3887 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3888 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3889 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3890 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3891 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3892 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3893 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3894 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3895 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3896 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3897 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3898 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3899 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3900 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3901 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3902 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3903 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3904 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3905 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3906 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3907 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3908 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3909 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3910 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3911 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3912 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3913 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3914 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3915 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3916 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3917 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3918 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3919 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3920 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3921 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3922 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3923 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3924 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3925 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3926 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3927 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3928 };
3929 static const CostKindTblEntry GFNICostTbl[] = {
3930 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
3931 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
3932 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
3933 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
3934 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3935 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3936 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3937 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
3938 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3939 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3940 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
3941 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3942 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3943 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
3944 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3945 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3946 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3947 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3948 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3949 };
3950 static const CostKindTblEntry GLMCostTbl[] = {
3951 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3952 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3953 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3954 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3955 };
3956 static const CostKindTblEntry SLMCostTbl[] = {
3957 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3958 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3959 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3960 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3961 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3962 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3963 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3964 };
3965 static const CostKindTblEntry SSE42CostTbl[] = {
3966 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3967 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3968 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3969 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3970 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3971 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3972 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3973 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3974 };
3975 static const CostKindTblEntry SSE41CostTbl[] = {
3976 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3977 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3978 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3979 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3980 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3981 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3982 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3983 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3984 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3985 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3986 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3987 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3988 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3989 };
3990 static const CostKindTblEntry SSSE3CostTbl[] = {
3991 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3992 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3993 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3994 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3995 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3996 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3997 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3998 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3999 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4000 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4001 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4002 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4003 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4004 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4005 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4006 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4007 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4008 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4009 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4010 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4011 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4012 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4013 };
4014 static const CostKindTblEntry SSE2CostTbl[] = {
4015 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4016 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4017 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4018 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4019 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4020 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4021 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4022 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4023 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4024 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4025 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4026 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4027 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4028 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4029 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4030 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4031 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4032 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4033 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4034 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4035 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4036 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4037 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4038 { ISD::SADDSAT, MVT::v8i16, { 1 } },
4039 { ISD::SADDSAT, MVT::v16i8, { 1 } },
4040 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4041 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4042 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4043 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4044 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4045 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4046 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4047 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4048 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
4049 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
4050 { ISD::UADDSAT, MVT::v8i16, { 1 } },
4051 { ISD::UADDSAT, MVT::v16i8, { 1 } },
4052 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4053 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4054 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4055 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4056 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4057 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4058 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4059 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4060 { ISD::USUBSAT, MVT::v8i16, { 1 } },
4061 { ISD::USUBSAT, MVT::v16i8, { 1 } },
4062 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4063 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4064 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4065 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4066 };
4067 static const CostKindTblEntry SSE1CostTbl[] = {
4068 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4069 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4070 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4071 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4072 };
4073 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4074 { ISD::CTTZ, MVT::i64, { 1 } },
4075 };
4076 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4077 { ISD::CTTZ, MVT::i32, { 1 } },
4078 { ISD::CTTZ, MVT::i16, { 1 } },
4079 { ISD::CTTZ, MVT::i8, { 1 } },
4080 };
4081 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4082 { ISD::CTLZ, MVT::i64, { 1 } },
4083 };
4084 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4085 { ISD::CTLZ, MVT::i32, { 1 } },
4086 { ISD::CTLZ, MVT::i16, { 2 } },
4087 { ISD::CTLZ, MVT::i8, { 2 } },
4088 };
4089 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4090 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4091 };
4092 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4093 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4094 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4095 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4096 };
4097 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4098 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4099 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4100 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4101 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4102 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
4103 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
4104 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF
4105 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4106 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4107 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4108 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4109 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4110 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4111 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4112 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4113 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4114 { ISD::SADDO, MVT::i64, { 1 } },
4115 { ISD::UADDO, MVT::i64, { 1 } },
4116 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
4117 };
4118 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4119 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4120 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4121 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4122 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4123 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4124 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4125 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4126 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4127 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4128 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4129 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4130 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
4131 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
4132 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4133 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
4134 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
4135 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
4136 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
4137 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
4138 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
4139 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4140 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4141 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4142 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4143 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4144 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4145 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4146 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4147 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4148 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4149 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4150 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4151 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4152 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4153 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4154 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4155 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4156 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4157 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4158 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4159 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4160 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4161 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4162 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4163 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4164 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4165 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4166 { ISD::SADDO, MVT::i32, { 1 } },
4167 { ISD::SADDO, MVT::i16, { 1 } },
4168 { ISD::SADDO, MVT::i8, { 1 } },
4169 { ISD::UADDO, MVT::i32, { 1 } },
4170 { ISD::UADDO, MVT::i16, { 1 } },
4171 { ISD::UADDO, MVT::i8, { 1 } },
4172 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4173 { ISD::UMULO, MVT::i16, { 2 } },
4174 { ISD::UMULO, MVT::i8, { 2 } },
4175 };
4176
4177 Type *RetTy = ICA.getReturnType();
4178 Type *OpTy = RetTy;
4179 Intrinsic::ID IID = ICA.getID();
4180 unsigned ISD = ISD::DELETED_NODE;
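 // ISD::DELETED_NODE acts as a sentinel: it remains set only when the
 // intrinsic has no mapping onto the cost tables above.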
4181 switch (IID) {
4182 default:
4183 break;
4184 case Intrinsic::abs:
4185 ISD = ISD::ABS;
4186 break;
4187 case Intrinsic::bitreverse:
4188 ISD = ISD::BITREVERSE;
4189 break;
4190 case Intrinsic::bswap:
4191 ISD = ISD::BSWAP;
4192 break;
4193 case Intrinsic::ctlz:
4194 ISD = ISD::CTLZ;
4195 break;
4196 case Intrinsic::ctpop:
4197 ISD = ISD::CTPOP;
4198 break;
4199 case Intrinsic::cttz:
4200 ISD = ISD::CTTZ;
4201 break;
4202 case Intrinsic::fshl:
4203 ISD = ISD::FSHL;
4204 if (!ICA.isTypeBasedOnly()) {
4205 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4206 if (Args[0] == Args[1]) {
4207 ISD = ISD::ROTL;
4208 // Handle uniform constant rotation amounts.
4209 // TODO: Handle funnel-shift cases.
4210 const APInt *Amt;
4211 if (Args[2] &&
4212 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4213 ISD = X86ISD::VROTLI;
4214 }
4215 }
4216 break;
4217 case Intrinsic::fshr:
4218 // FSHR has same costs so don't duplicate.
4219 ISD = ISD::FSHL;
4220 if (!ICA.isTypeBasedOnly()) {
4221 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4222 if (Args[0] == Args[1]) {
4223 ISD = ISD::ROTR;
4224 // Handle uniform constant rotation amount.
4225 // TODO: Handle funnel-shift cases.
4226 const APInt *Amt;
4227 if (Args[2] &&
4228 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4229 ISD = X86ISD::VROTLI;
4230 }
4231 }
4232 break;
4233 case Intrinsic::lrint:
4234 case Intrinsic::llrint:
4235 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4236 // have the same costs as the CVTTP2SI (fptosi) instructions.
4237 if (!ICA.isTypeBasedOnly()) {
4238 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4239 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4240 TTI::CastContextHint::None, CostKind);
4241 }
4242 break;
4243 case Intrinsic::maxnum:
4244 case Intrinsic::minnum:
4245 // FMINNUM has same costs so don't duplicate.
4246 ISD = ISD::FMAXNUM;
4247 break;
4248 case Intrinsic::sadd_sat:
4249 ISD = ISD::SADDSAT;
4250 break;
4251 case Intrinsic::smax:
4252 ISD = ISD::SMAX;
4253 break;
4254 case Intrinsic::smin:
4255 ISD = ISD::SMIN;
4256 break;
4257 case Intrinsic::ssub_sat:
4258 ISD = ISD::SSUBSAT;
4259 break;
4260 case Intrinsic::uadd_sat:
4261 ISD = ISD::UADDSAT;
4262 break;
4263 case Intrinsic::umax:
4264 ISD = ISD::UMAX;
4265 break;
4266 case Intrinsic::umin:
4267 ISD = ISD::UMIN;
4268 break;
4269 case Intrinsic::usub_sat:
4270 ISD = ISD::USUBSAT;
4271 break;
4272 case Intrinsic::sqrt:
4273 ISD = ISD::FSQRT;
4274 break;
4275 case Intrinsic::sadd_with_overflow:
4276 case Intrinsic::ssub_with_overflow:
4277 // SSUBO has same costs so don't duplicate.
4278 ISD = ISD::SADDO;
4279 OpTy = RetTy->getContainedType(0);
4280 break;
4281 case Intrinsic::uadd_with_overflow:
4282 case Intrinsic::usub_with_overflow:
4283 // USUBO has same costs so don't duplicate.
4284 ISD = ISD::UADDO;
4285 OpTy = RetTy->getContainedType(0);
4286 break;
4287 case Intrinsic::umul_with_overflow:
4288 case Intrinsic::smul_with_overflow:
4289 // SMULO has same costs so don't duplicate.
4290 ISD = ISD::UMULO;
4291 OpTy = RetTy->getContainedType(0);
4292 break;
4293 }
4294
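 // The intrinsic mapped onto an ISD opcode, so probe the cost tables;
 // unmatched intrinsics fall through to the base implementation below.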
4295 if (ISD != ISD::DELETED_NODE) {
4296 auto adjustTableCost = [&](int ISD, unsigned Cost,
4297 std::pair<InstructionCost, MVT> LT,
4298 FastMathFlags FMF) {
4299 InstructionCost LegalizationCost = LT.first;
4300 MVT MTy = LT.second;
4301
4302 // If there are no NaNs to deal with, then these are reduced to a
4303 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4304 // assume is used in the non-fast case.
4305 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4306 if (FMF.noNaNs())
4307 return LegalizationCost * 1;
4308 }
4309
4310 // For cases where some ops can be folded into a load/store, assume free.
4311 if (MTy.isScalarInteger()) {
4312 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4313 if (const Instruction *II = ICA.getInst()) {
4314 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4315 return TTI::TCC_Free;
4316 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4317 if (LI->hasOneUse())
4318 return TTI::TCC_Free;
4319 }
4320 }
4321 }
4322 }
4323
4324 return LegalizationCost * (int)Cost;
4325 };
4326
4327 // Legalize the type.
4328 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4329 MVT MTy = LT.second;
4330
4331 // Without BMI/LZCNT, see if we're only looking for a *_ZERO_UNDEF cost.
4332 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4333 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4334 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4335 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4336 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4337 if (Cst->isAllOnesValue())
4338 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4339 }
4340
4341 // FSQRT is a single instruction.
4342 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4343 return LT.first;
4344
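 // Probe the tables from the most specific subtarget feature down to the
 // generic 32/64-bit baseline; the first table with an (ISD, MTy) entry wins.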
4345 if (ST->useGLMDivSqrtCosts())
4346 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4347 if (auto KindCost = Entry->Cost[CostKind])
4348 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4349
4350 if (ST->useSLMArithCosts())
4351 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4352 if (auto KindCost = Entry->Cost[CostKind])
4353 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4354
4355 if (ST->hasVBMI2())
4356 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4357 if (auto KindCost = Entry->Cost[CostKind])
4358 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4359
4360 if (ST->hasBITALG())
4361 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4362 if (auto KindCost = Entry->Cost[CostKind])
4363 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4364
4365 if (ST->hasVPOPCNTDQ())
4366 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4367 if (auto KindCost = Entry->Cost[CostKind])
4368 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4369
4370 if (ST->hasGFNI())
4371 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4372 if (auto KindCost = Entry->Cost[CostKind])
4373 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4374
4375 if (ST->hasCDI())
4376 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4377 if (auto KindCost = Entry->Cost[CostKind])
4378 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4379
4380 if (ST->hasBWI())
4381 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4382 if (auto KindCost = Entry->Cost[CostKind])
4383 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4384
4385 if (ST->hasAVX512())
4386 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4387 if (auto KindCost = Entry->Cost[CostKind])
4388 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4389
4390 if (ST->hasXOP())
4391 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4392 if (auto KindCost = Entry->Cost[CostKind])
4393 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4394
4395 if (ST->hasAVX2())
4396 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4397 if (auto KindCost = Entry->Cost[CostKind])
4398 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4399
4400 if (ST->hasAVX())
4401 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4402 if (auto KindCost = Entry->Cost[CostKind])
4403 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4404
4405 if (ST->hasSSE42())
4406 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4407 if (auto KindCost = Entry->Cost[CostKind])
4408 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4409
4410 if (ST->hasSSE41())
4411 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4412 if (auto KindCost = Entry->Cost[CostKind])
4413 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4414
4415 if (ST->hasSSSE3())
4416 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4417 if (auto KindCost = Entry->Cost[CostKind])
4418 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4419
4420 if (ST->hasSSE2())
4421 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4422 if (auto KindCost = Entry->Cost[CostKind])
4423 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4424
4425 if (ST->hasSSE1())
4426 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4427 if (auto KindCost = Entry->Cost[CostKind])
4428 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4429
4430 if (ST->hasBMI()) {
4431 if (ST->is64Bit())
4432 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4433 if (auto KindCost = Entry->Cost[CostKind])
4434 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4435
4436 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4437 if (auto KindCost = Entry->Cost[CostKind])
4438 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4439 }
4440
4441 if (ST->hasLZCNT()) {
4442 if (ST->is64Bit())
4443 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4444 if (auto KindCost = Entry->Cost[CostKind])
4445 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4446
4447 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4448 if (auto KindCost = Entry->Cost[CostKind])
4449 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4450 }
4451
4452 if (ST->hasPOPCNT()) {
4453 if (ST->is64Bit())
4454 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4455 if (auto KindCost = Entry->Cost[CostKind])
4456 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4457
4458 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4459 if (auto KindCost = Entry->Cost[CostKind])
4460 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4461 }
4462
4463 if (ST->is64Bit())
4464 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4465 if (auto KindCost = Entry->Cost[CostKind])
4466 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4467
4468 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4469 if (auto KindCost = Entry->Cost[CostKind])
4470 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4471 }
4472
4473 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4474}
4475
4476InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4477 TTI::TargetCostKind CostKind,
4478 unsigned Index, Value *Op0,
4479 Value *Op1) {
4480 static const CostTblEntry SLMCostTbl[] = {
4481 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4482 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4483 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4484 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4485 };
4486
4487 assert(Val->isVectorTy() && "This must be a vector type");
4488 Type *ScalarType = Val->getScalarType();
4489 InstructionCost RegisterFileMoveCost = 0;
4490
4491 // Non-immediate extraction/insertion can be handled as a sequence of
4492 // aliased loads+stores via the stack.
4493 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4494 Opcode == Instruction::InsertElement)) {
4495 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4496 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4497
4498 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4499 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4500 Align VecAlign = DL.getPrefTypeAlign(Val);
4501 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4502
4503 // Extract - store vector to stack, load scalar.
4504 if (Opcode == Instruction::ExtractElement) {
4505 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4506 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4507 CostKind);
4508 }
4509 // Insert - store vector to stack, store scalar, load vector.
4510 if (Opcode == Instruction::InsertElement) {
4511 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4512 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4513 CostKind) +
4514 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4515 }
4516 }
4517
4518 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4519 Opcode == Instruction::InsertElement)) {
4520 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4521 if (Opcode == Instruction::ExtractElement &&
4522 ScalarType->getScalarSizeInBits() == 1 &&
4523 cast<FixedVectorType>(Val)->getNumElements() > 1)
4524 return 1;
4525
4526 // Legalize the type.
4527 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4528
4529 // This type is legalized to a scalar type.
4530 if (!LT.second.isVector())
4531 return 0;
4532
4533 // The type may be split. Normalize the index to the new type.
4534 unsigned SizeInBits = LT.second.getSizeInBits();
4535 unsigned NumElts = LT.second.getVectorNumElements();
4536 unsigned SubNumElts = NumElts;
4537 Index = Index % NumElts;
4538
4539 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4540 // For inserts, we also need to insert the subvector back.
4541 if (SizeInBits > 128) {
4542 assert((SizeInBits % 128) == 0 && "Illegal vector");
4543 unsigned NumSubVecs = SizeInBits / 128;
4544 SubNumElts = NumElts / NumSubVecs;
4545 if (SubNumElts <= Index) {
4546 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4547 Index %= SubNumElts;
4548 }
4549 }
4550
4551 MVT MScalarTy = LT.second.getScalarType();
4552 auto IsCheapPInsrPExtrInsertPS = [&]() {
4553 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4554 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4555 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4556 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4557 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4558 Opcode == Instruction::InsertElement);
4559 };
4560
4561 if (Index == 0) {
4562 // Floating point scalars are already located in index #0.
4563 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4564 // true for all.
4565 if (ScalarType->isFloatingPointTy() &&
4566 (Opcode != Instruction::InsertElement || !Op0 ||
4567 isa<UndefValue>(Op0)))
4568 return RegisterFileMoveCost;
4569
4570 if (Opcode == Instruction::InsertElement &&
4571 isa_and_nonnull<UndefValue>(Op0)) {
4572 // Consider the gather cost to be cheap.
4573 if (isa_and_nonnull<LoadInst>(Op1))
4574 return RegisterFileMoveCost;
4575 if (!IsCheapPInsrPExtrInsertPS()) {
4576 // mov constant-to-GPR + movd/movq GPR -> XMM.
4577 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4578 return 2 + RegisterFileMoveCost;
4579 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4580 return 1 + RegisterFileMoveCost;
4581 }
4582 }
4583
4584 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4585 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4586 return 1 + RegisterFileMoveCost;
4587 }
4588
4589 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4590 assert(ISD && "Unexpected vector opcode");
4591 if (ST->useSLMArithCosts())
4592 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4593 return Entry->Cost + RegisterFileMoveCost;
4594
4595 // Consider cheap cases.
4596 if (IsCheapPInsrPExtrInsertPS())
4597 return 1 + RegisterFileMoveCost;
4598
4599 // For extractions we just need to shuffle the element to index 0, which
4600 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4601 // the element to its destination. In both cases we must handle the
4602 // subvector move(s).
4603 // If the vector type is already less than 128-bits then don't reduce it.
4604 // TODO: Under what circumstances should we shuffle using the full width?
4605 InstructionCost ShuffleCost = 1;
4606 if (Opcode == Instruction::InsertElement) {
4607 auto *SubTy = cast<VectorType>(Val);
4608 EVT VT = TLI->getValueType(DL, Val);
4609 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4610 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4611 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4612 CostKind, 0, SubTy);
4613 }
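 // Integer elements pay for an extra GPR <-> XMM transfer; floating point
 // elements already live in XMM registers.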
4614 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4615 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4616 }
4617
4618 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4619 RegisterFileMoveCost;
4620}
4621
4622InstructionCost
4623X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4624 bool Insert, bool Extract,
4625 TTI::TargetCostKind CostKind) {
4626 assert(DemandedElts.getBitWidth() ==
4627 cast<FixedVectorType>(Ty)->getNumElements() &&
4628 "Vector size mismatch");
4629
4630 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4631 MVT MScalarTy = LT.second.getScalarType();
4632 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4633 InstructionCost Cost = 0;
4634
4635 constexpr unsigned LaneBitWidth = 128;
4636 assert((LegalVectorBitWidth < LaneBitWidth ||
4637 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4638 "Illegal vector");
4639
4640 const int NumLegalVectors = *LT.first.getValue();
4641 assert(NumLegalVectors >= 0 && "Negative cost!");
4642
4643 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4644 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4645 if (Insert) {
4646 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4647 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4648 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4649 // For types we can insert directly, insertion into 128-bit subvectors is
4650 // cheap, followed by a cheap chain of concatenations.
4651 if (LegalVectorBitWidth <= LaneBitWidth) {
4652 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4653 /*Extract*/ false, CostKind);
4654 } else {
4655 // In each 128-bit lane, if at least one index is demanded but not all
4656 // indices are demanded, and this 128-bit lane is not the first 128-bit
4657 // lane of the legalized vector, then this lane needs an extracti128; if
4658 // in each 128-bit lane there is at least one demanded index, this lane
4659 // needs an inserti128.
4660
4661 // The following cases help build a better understanding:
4662 // Assume we insert several elements into a v8i32 vector with AVX2:
4663 // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
4664 // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
4665 // inserti128.
4666 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4667 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4668 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4669 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4670 unsigned NumLegalElts =
4671 LT.second.getVectorNumElements() * NumLegalVectors;
4672 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4673 "Vector has been legalized to smaller element count");
4674 assert((NumLegalElts % NumLanesTotal) == 0 &&
4675 "Unexpected elts per lane");
4676 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4677
4678 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4679 auto *LaneTy =
4680 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4681
4682 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4683 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4684 NumEltsPerLane, NumEltsPerLane * I);
4685 if (LaneEltMask.isZero())
4686 continue;
4687 // FIXME: we don't need to extract if all non-demanded elements
4688 // are legalization-inserted padding.
4689 if (!LaneEltMask.isAllOnes())
4690 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4691 CostKind, I * NumEltsPerLane, LaneTy);
4692 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4693 /*Extract*/ false, CostKind);
4694 }
4695
4696 APInt AffectedLanes =
4697 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4698 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4699 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
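 // ScaleBitMask condenses the demanded-elts mask to one bit per 128-bit
 // lane, then (with MatchAllBits) to one bit per legal vector that is set
 // only if every lane of that vector is affected.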
4700 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4701 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4702 unsigned I = NumLegalLanes * LegalVec + Lane;
4703 // No need to insert an unaffected lane, or lane 0 of each legal vector
4704 // iff ALL lanes of that vector were affected and will be inserted.
4705 if (!AffectedLanes[I] ||
4706 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4707 continue;
4708 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4709 CostKind, I * NumEltsPerLane, LaneTy);
4710 }
4711 }
4712 }
4713 } else if (LT.second.isVector()) {
4714 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4715 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4716 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4717 // considered cheap.
4718 if (Ty->isIntOrIntVectorTy())
4719 Cost += DemandedElts.popcount();
4720
4721 // Get the smaller of the legalized or original pow2-extended number of
4722 // vector elements, which represents the number of unpacks we'll end up
4723 // performing.
4724 unsigned NumElts = LT.second.getVectorNumElements();
4725 unsigned Pow2Elts =
4726 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4727 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4728 }
4729 }
4730
4731 if (Extract) {
4732 // vXi1 can be efficiently extracted with MOVMSK.
4733 // TODO: AVX512 predicate mask handling.
4734 // NOTE: This doesn't work well for roundtrip scalarization.
4735 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4736 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4737 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
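 // Each MOVMSK op covers up to 32 i1 elements with AVX2 (256-bit), else 16.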
4738 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4739 return MOVMSKCost;
4740 }
4741
4742 if (LT.second.isVector()) {
4743 unsigned NumLegalElts =
4744 LT.second.getVectorNumElements() * NumLegalVectors;
4745 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4746 "Vector has been legalized to smaller element count");
4747
4748 // If we're extracting elements from a 128-bit subvector lane,
4749 // we only need to extract each lane once, not for every element.
4750 if (LegalVectorBitWidth > LaneBitWidth) {
4751 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4752 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4753 assert((NumLegalElts % NumLanesTotal) == 0 &&
4754 "Unexpected elts per lane");
4755 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4756
4757 // Add cost for each demanded 128-bit subvector extraction.
4758 // Luckily this is a lot easier than for insertion.
4759 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4760 auto *LaneTy =
4761 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4762
4763 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4764 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4765 NumEltsPerLane, I * NumEltsPerLane);
4766 if (LaneEltMask.isZero())
4767 continue;
4768 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4769 CostKind, I * NumEltsPerLane, LaneTy);
4770 Cost += BaseT::getScalarizationOverhead(
4771 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4772 }
4773
4774 return Cost;
4775 }
4776 }
4777
4778 // Fallback to default extraction.
4779 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4780 Extract, CostKind);
4781 }
4782
4783 return Cost;
4784}
4785
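// A replication shuffle <ReplicationFactor, VF> repeats each of the VF
// source elements ReplicationFactor times, e.g. factor 2 over <a,b,c,d>
// yields <a,a,b,b,c,c,d,d>.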
4786InstructionCost
4787X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4788 int VF, const APInt &DemandedDstElts,
4789 TTI::TargetCostKind CostKind) {
4790 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4791 // We don't differentiate element types here, only element bit width.
4792 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4793
4794 auto bailout = [&]() {
4795 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4796 DemandedDstElts, CostKind);
4797 };
4798
4799 // For now, only deal with AVX512 cases.
4800 if (!ST->hasAVX512())
4801 return bailout();
4802
4803 // Do we have a native shuffle for this element type, or should we promote?
4804 unsigned PromEltTyBits = EltTyBits;
4805 switch (EltTyBits) {
4806 case 32:
4807 case 64:
4808 break; // AVX512F.
4809 case 16:
4810 if (!ST->hasBWI())
4811 PromEltTyBits = 32; // promote to i32, AVX512F.
4812 break; // AVX512BW
4813 case 8:
4814 if (!ST->hasVBMI())
4815 PromEltTyBits = 32; // promote to i32, AVX512F.
4816 break; // AVX512VBMI
4817 case 1:
4818 // There is no support for shuffling i1 elements. We *must* promote.
4819 if (ST->hasBWI()) {
4820 if (ST->hasVBMI())
4821 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4822 else
4823 PromEltTyBits = 16; // promote to i16, AVX512BW.
4824 break;
4825 }
4826 PromEltTyBits = 32; // promote to i32, AVX512F.
4827 break;
4828 default:
4829 return bailout();
4830 }
4831 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4832
4833 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4834 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4835
4836 int NumDstElements = VF * ReplicationFactor;
4837 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4838 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4839
4840 // Legalize the types.
4841 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4842 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4843 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4844 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4845 // They should have legalized into vector types.
4846 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4847 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4848 return bailout();
4849
4850 if (PromEltTyBits != EltTyBits) {
4851 // If we have to perform the shuffle with wider elt type than our data type,
4852 // then we will first need to anyext (we don't care about the new bits)
4853 // the source elements, and then truncate Dst elements.
4854 InstructionCost PromotionCost;
4855 PromotionCost += getCastInstrCost(
4856 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4857 TTI::CastContextHint::None, CostKind);
4858 PromotionCost +=
4859 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4860 /*Src=*/PromDstVecTy,
4861 TTI::CastContextHint::None, CostKind);
4862 return PromotionCost + getReplicationShuffleCost(PromEltTy,
4863 ReplicationFactor, VF,
4864 DemandedDstElts, CostKind);
4865 }
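// (Sketch of the recursion above: replicating i1 on an AVX512BW target
// without VBMI anyext's the mask to vXi16, does the shuffle in i16, and
// then truncates the result back to i1.)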
4866
4867 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4868 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4869 "We expect that the legalization doesn't affect the element width, "
4870 "and doesn't coalesce/split elements.");
4871
4872 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4873 unsigned NumDstVectors =
4874 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4875
4876 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4877
4878 // Not all the produced Dst elements may be demanded. In our case,
4879 // given that a single Dst vector is formed by a single shuffle,
4880 // if all elements that will form a single Dst vector aren't demanded,
4881 // then we won't need to do that shuffle, so adjust the cost accordingly.
4882 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4883 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4884 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4885
4886 InstructionCost SingleShuffleCost = getShuffleCost(
4887 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4888 /*Index=*/0, /*SubTp=*/nullptr);
4889 return NumDstVectorsDemanded * SingleShuffleCost;
4890}
4891
4892InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4893 MaybeAlign Alignment,
4894 unsigned AddressSpace,
4895 TTI::TargetCostKind CostKind,
4896 TTI::OperandValueInfo OpInfo,
4897 const Instruction *I) {
4898 // TODO: Handle other cost kinds.
4899 if (CostKind != TTI::TCK_RecipThroughput) {
4900 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4901 // Store instruction with index and scale costs 2 Uops.
4902 // Check the preceding GEP to identify non-const indices.
4903 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4904 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4905 return TTI::TCC_Basic * 2;
4906 }
4907 }
4908 return TTI::TCC_Basic;
4909 }
4910
4911 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4912 "Invalid Opcode");
4913 // Type legalization can't handle structs
4914 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4915 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4916 CostKind);
4917
4918 // Legalize the type.
4919 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4920
4921 auto *VTy = dyn_cast<FixedVectorType>(Src);
4922
4923 InstructionCost Cost = 0;
4924
4925 // Add a cost for constant load to vector.
4926 if (Opcode == Instruction::Store && OpInfo.isConstant())
4927 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4928 /*AddressSpace=*/0, CostKind);
4929
4930 // Handle the simple case of non-vectors.
4931 // NOTE: this assumes that legalization never creates vector from scalars!
4932 if (!VTy || !LT.second.isVector()) {
4933 // Each load/store unit costs 1.
4934 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4935 }
4936
4937 bool IsLoad = Opcode == Instruction::Load;
4938
4939 Type *EltTy = VTy->getElementType();
4940
4941 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4942
4943 // Source of truth: how many elements were there in the original IR vector?
4944 const unsigned SrcNumElt = VTy->getNumElements();
4945
4946 // How far have we gotten?
4947 int NumEltRemaining = SrcNumElt;
4948 // Note that we intentionally capture by-reference, NumEltRemaining changes.
4949 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4950
4951 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4952
4953 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4954 const unsigned XMMBits = 128;
4955 if (XMMBits % EltTyBits != 0)
4956 // Vector size must be a multiple of the element size. I.e. no padding.
4957 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4958 CostKind);
4959 const int NumEltPerXMM = XMMBits / EltTyBits;
4960
4961 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4962
4963 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4964 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4965 // How many elements would a single op deal with at once?
4966 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4967 // Vector size must be a multiple of the element size. I.e. no padding.
4968 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4969 CostKind);
4970 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4971
4972 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4973 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4974 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4975 "Unless we haven't halved the op size yet, "
4976 "we have less than two op's sized units of work left.");
4977
4978 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4979 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4980 : XMMVecTy;
4981
4982 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4983 "After halving sizes, the vector elt count is no longer a multiple "
4984 "of number of elements per operation?");
4985 auto *CoalescedVecTy =
4986 CurrNumEltPerOp == 1
4987 ? CurrVecTy
4988 : FixedVectorType::get(
4989 IntegerType::get(Src->getContext(),
4990 EltTyBits * CurrNumEltPerOp),
4991 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4992 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4993 DL.getTypeSizeInBits(CurrVecTy) &&
4994 "Coalescing elements doesn't change vector width.");
4995
4996 while (NumEltRemaining > 0) {
4997 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
4998
4999 // Can we use this vector size, as per the remaining element count?
5000 // Iff the vector is naturally aligned, we can do a wide load regardless.
5001 if (NumEltRemaining < CurrNumEltPerOp &&
5002 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5003 CurrOpSizeBytes != 1)
5004 break; // Try a smaller vector size.
5005
5006 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5007
5008 // If we have fully processed the previous reg, we need to replenish it.
5009 if (SubVecEltsLeft == 0) {
5010 SubVecEltsLeft += CurrVecTy->getNumElements();
5011 // And that's free only for the 0'th subvector of a legalized vector.
5012 if (!Is0thSubVec)
5013 Cost += getShuffleCost(IsLoad ? TTI::SK_InsertSubvector
5014 : TTI::SK_ExtractSubvector,
5015 VTy, std::nullopt, CostKind, NumEltDone(),
5016 CurrVecTy);
5017 }
5018
5019 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5020 // for smaller widths (32/16/8) we have to insert/extract them separately.
5021 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5022 // but let's pretend that it is also true for 16/8 bit wide ops...)
5023 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5024 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5025 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5026 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5027 APInt DemandedElts =
5028 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5029 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5030 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5031 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5032 !IsLoad, CostKind);
5033 }
5034
5035 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5036 // as a proxy for a double-pumped AVX memory interface such as on
5037 // Sandybridge.
5038 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5039 // will be scalarized.
5040 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5041 Cost += 2;
5042 else if (CurrOpSizeBytes < 4)
5043 Cost += 2;
5044 else
5045 Cost += 1;
5046
5047 SubVecEltsLeft -= CurrNumEltPerOp;
5048 NumEltRemaining -= CurrNumEltPerOp;
5049 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5050 }
5051 }
5052
5053 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5054
5055 return Cost;
5056}
5057
5058InstructionCost
5059X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5060 unsigned AddressSpace,
5061 TTI::TargetCostKind CostKind) {
5062 bool IsLoad = (Instruction::Load == Opcode);
5063 bool IsStore = (Instruction::Store == Opcode);
5064
5065 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5066 if (!SrcVTy)
5067 // For a scalar, take the regular cost without the mask.
5068 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5069
5070 unsigned NumElem = SrcVTy->getNumElements();
5071 auto *MaskTy =
5072 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5073 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5074 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5075 // Scalarization
5076 APInt DemandedElts = APInt::getAllOnes(NumElem);
5077 InstructionCost MaskSplitCost = getScalarizationOverhead(
5078 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5079 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5080 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5081 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5082 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5083 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5084 InstructionCost ValueSplitCost = getScalarizationOverhead(
5085 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5086 InstructionCost MemopCost =
5087 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5088 Alignment, AddressSpace, CostKind);
5089 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5090 }
5091
5092 // Legalize the type.
5093 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5094 auto VT = TLI->getValueType(DL, SrcVTy);
5095 InstructionCost Cost = 0;
5096 MVT Ty = LT.second;
5097 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5098 // APX masked load/store for scalar is cheap.
5099 return Cost + LT.first;
5100
5101 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5102 LT.second.getVectorNumElements() == NumElem)
5103 // Promotion requires extend/truncate for data and a shuffle for mask.
5104 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
5105 CostKind, 0, nullptr) +
5106 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
5107 CostKind, 0, nullptr);
5108
5109 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5110 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5111 Ty.getVectorNumElements());
5112 // Expanding requires filling the mask with zeroes.
5113 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
5114 CostKind, 0, MaskTy);
5115 }
5116
5117 // Pre-AVX512 - each maskmov load costs ~2 and each maskmov store ~8.
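// (E.g. a legal v8f32 masked store on an AVX1 target is therefore costed
// ~LT.first * 8 below, while the AVX-512 path further down returns
// ~LT.first.)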
5118 if (!ST->hasAVX512())
5119 return Cost + LT.first * (IsLoad ? 2 : 8);
5120
5121 // AVX-512 masked load/store is cheaper
5122 return Cost + LT.first;
5123}
5124
5125InstructionCost
5126X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5127 const Value *Base,
5128 const TTI::PointersChainInfo &Info,
5129 Type *AccessTy, TTI::TargetCostKind CostKind) {
5130 if (Info.isSameBase() && Info.isKnownStride()) {
5131 // If all the pointers have known stride all the differences are translated
5132 // into constants. X86 memory addressing allows encoding it into
5133 // displacement. So we just need to take the base GEP cost.
5134 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5135 SmallVector<const Value *> Indices(BaseGEP->indices());
5136 return getGEPCost(BaseGEP->getSourceElementType(),
5137 BaseGEP->getPointerOperand(), Indices, nullptr,
5138 CostKind);
5139 }
5140 return TTI::TCC_Free;
5141 }
5142 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5143}
5144
5145InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5146 ScalarEvolution *SE,
5147 const SCEV *Ptr) {
5148 // Address computations in vectorized code with non-consecutive addresses will
5149 // likely result in more instructions compared to scalar code where the
5150 // computation can more often be merged into the index mode. The resulting
5151 // extra micro-ops can significantly decrease throughput.
5152 const unsigned NumVectorInstToHideOverhead = 10;
5153
5154 // Cost modeling of Strided Access Computation is hidden by the indexing
5155 // modes of X86 regardless of the stride value. We don't believe that there
5156 // is a difference between constant strided access in general and a constant
5157 // strided value which is less than or equal to 64.
5158 // Even in the case of (loop invariant) stride whose value is not known at
5159 // compile time, the address computation will not incur more than one extra
5160 // ADD instruction.
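// (E.g. on a pre-AVX2 target, a vectorized access whose pointer SCEV is
// not recognized as strided is charged 10 to discourage vectorization,
// while a recognizable strided access is charged only 1 below.)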
5161 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5162 // TODO: AVX2 is the current cut-off because we don't have correct
5163 // interleaving costs for prior ISA's.
5164 if (!BaseT::isStridedAccess(Ptr))
5165 return NumVectorInstToHideOverhead;
5166 if (!BaseT::getConstantStrideStep(SE, Ptr))
5167 return 1;
5168 }
5169
5170 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5171}
5172
5173InstructionCost
5174X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5175 std::optional<FastMathFlags> FMF,
5176 TTI::TargetCostKind CostKind) {
5177 if (TTI::requiresOrderedReduction(FMF))
5178 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5179
5180 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5181 // and use it as the cost.
5182
5183 static const CostTblEntry SLMCostTbl[] = {
5184 { ISD::FADD, MVT::v2f64, 3 },
5185 { ISD::ADD, MVT::v2i64, 5 },
5186 };
5187
5188 static const CostTblEntry SSE2CostTbl[] = {
5189 { ISD::FADD, MVT::v2f64, 2 },
5190 { ISD::FADD, MVT::v2f32, 2 },
5191 { ISD::FADD, MVT::v4f32, 4 },
5192 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5193 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5194 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5195 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5196 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5197 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5198 { ISD::ADD, MVT::v2i8, 2 },
5199 { ISD::ADD, MVT::v4i8, 2 },
5200 { ISD::ADD, MVT::v8i8, 2 },
5201 { ISD::ADD, MVT::v16i8, 3 },
5202 };
5203
5204 static const CostTblEntry AVX1CostTbl[] = {
5205 { ISD::FADD, MVT::v4f64, 3 },
5206 { ISD::FADD, MVT::v4f32, 3 },
5207 { ISD::FADD, MVT::v8f32, 4 },
5208 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5209 { ISD::ADD, MVT::v4i64, 3 },
5210 { ISD::ADD, MVT::v8i32, 5 },
5211 { ISD::ADD, MVT::v16i16, 5 },
5212 { ISD::ADD, MVT::v32i8, 4 },
5213 };
5214
5215 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5216 assert(ISD && "Invalid opcode");
5217
5218 // Before legalizing the type, give a chance to look up illegal narrow types
5219 // in the table.
5220 // FIXME: Is there a better way to do this?
5221 EVT VT = TLI->getValueType(DL, ValTy);
5222 if (VT.isSimple()) {
5223 MVT MTy = VT.getSimpleVT();
5224 if (ST->useSLMArithCosts())
5225 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5226 return Entry->Cost;
5227
5228 if (ST->hasAVX())
5229 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5230 return Entry->Cost;
5231
5232 if (ST->hasSSE2())
5233 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5234 return Entry->Cost;
5235 }
5236
5237 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5238
5239 MVT MTy = LT.second;
5240
5241 auto *ValVTy = cast<FixedVectorType>(ValTy);
5242
5243 // Special case: vXi8 mul reductions are performed as vXi16.
5244 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5245 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5246 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5247 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5248 TTI::CastContextHint::None,
5249 CostKind) +
5250 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5251 }
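// (Sketch: a v16i8 mul reduction is costed as a zext to v16i16 plus a
// v16i16 mul reduction; the final truncation is assumed to fold away.)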
5252
5253 InstructionCost ArithmeticCost = 0;
5254 if (LT.first != 1 && MTy.isVector() &&
5255 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5256 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5257 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5258 MTy.getVectorNumElements());
5259 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5260 ArithmeticCost *= LT.first - 1;
5261 }
5262
5263 if (ST->useSLMArithCosts())
5264 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5265 return ArithmeticCost + Entry->Cost;
5266
5267 if (ST->hasAVX())
5268 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5269 return ArithmeticCost + Entry->Cost;
5270
5271 if (ST->hasSSE2())
5272 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5273 return ArithmeticCost + Entry->Cost;
5274
5275 // FIXME: These assume a naive kshift+binop lowering, which is probably
5276 // conservative in most cases.
5277 static const CostTblEntry AVX512BoolReduction[] = {
5278 { ISD::AND, MVT::v2i1, 3 },
5279 { ISD::AND, MVT::v4i1, 5 },
5280 { ISD::AND, MVT::v8i1, 7 },
5281 { ISD::AND, MVT::v16i1, 9 },
5282 { ISD::AND, MVT::v32i1, 11 },
5283 { ISD::AND, MVT::v64i1, 13 },
5284 { ISD::OR, MVT::v2i1, 3 },
5285 { ISD::OR, MVT::v4i1, 5 },
5286 { ISD::OR, MVT::v8i1, 7 },
5287 { ISD::OR, MVT::v16i1, 9 },
5288 { ISD::OR, MVT::v32i1, 11 },
5289 { ISD::OR, MVT::v64i1, 13 },
5290 };
5291
5292 static const CostTblEntry AVX2BoolReduction[] = {
5293 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5294 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5295 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5296 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5297 };
5298
5299 static const CostTblEntry AVX1BoolReduction[] = {
5300 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5301 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5302 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5303 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5304 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5305 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5306 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5307 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5308 };
5309
5310 static const CostTblEntry SSE2BoolReduction[] = {
5311 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5312 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5313 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5314 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5315 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5316 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5317 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5318 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5319 };
5320
5321 // Handle bool allof/anyof patterns.
5322 if (ValVTy->getElementType()->isIntegerTy(1)) {
5323 InstructionCost ArithmeticCost = 0;
5324 if (LT.first != 1 && MTy.isVector() &&
5325 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5326 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5327 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5328 MTy.getVectorNumElements());
5329 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5330 ArithmeticCost *= LT.first - 1;
5331 }
5332
5333 if (ST->hasAVX512())
5334 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5335 return ArithmeticCost + Entry->Cost;
5336 if (ST->hasAVX2())
5337 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5338 return ArithmeticCost + Entry->Cost;
5339 if (ST->hasAVX())
5340 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5341 return ArithmeticCost + Entry->Cost;
5342 if (ST->hasSSE2())
5343 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5344 return ArithmeticCost + Entry->Cost;
5345
5346 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5347 }
5348
5349 unsigned NumVecElts = ValVTy->getNumElements();
5350 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5351
5352 // Special case power of 2 reductions where the scalar type isn't changed
5353 // by type legalization.
5354 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5355 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5356
5357 InstructionCost ReductionCost = 0;
5358
5359 auto *Ty = ValVTy;
5360 if (LT.first != 1 && MTy.isVector() &&
5361 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5362 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5363 Ty = FixedVectorType::get(ValVTy->getElementType(),
5364 MTy.getVectorNumElements());
5365 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5366 ReductionCost *= LT.first - 1;
5367 NumVecElts = MTy.getVectorNumElements();
5368 }
5369
5370 // Now handle reduction with the legal type, taking into account size changes
5371 // at each level.
5372 while (NumVecElts > 1) {
5373 // Determine the size of the remaining vector we need to reduce.
5374 unsigned Size = NumVecElts * ScalarSize;
5375 NumVecElts /= 2;
5376 // If we're reducing from 256/512 bits, use an extract_subvector.
5377 if (Size > 128) {
5378 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5379 ReductionCost +=
5380 getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5381 NumVecElts, SubTy);
5382 Ty = SubTy;
5383 } else if (Size == 128) {
5384 // Reducing from 128 bits is a permute of v2f64/v2i64.
5385 FixedVectorType *ShufTy;
5386 if (ValVTy->isFloatingPointTy())
5387 ShufTy =
5388 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5389 else
5390 ShufTy =
5391 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5392 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5393 std::nullopt, CostKind, 0, nullptr);
5394 } else if (Size == 64) {
5395 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5396 FixedVectorType *ShufTy;
5397 if (ValVTy->isFloatingPointTy())
5398 ShufTy =
5399 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5400 else
5401 ShufTy =
5402 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5403 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5404 std::nullopt, CostKind, 0, nullptr);
5405 } else {
5406 // Reducing from smaller size is a shift by immediate.
5407 auto *ShiftTy = FixedVectorType::get(
5408 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5409 ReductionCost += getArithmeticInstrCost(
5410 Instruction::LShr, ShiftTy, CostKind,
5411 {TTI::OK_AnyValue, TTI::OP_None},
5412 {TTI::OK_UniformConstantValue, TTI::OP_None});
5413 }
5414
5415 // Add the arithmetic op for this level.
5416 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5417 }
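// (Sketch: a legal v8f32 fadd reduction walks this loop three times -
// one 128-bit subvector extract and two in-register shuffles - with a
// vector fadd at each step, plus the final extractelement below.)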
5418
5419 // Add the final extract element to the cost.
5420 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5421 CostKind, 0, nullptr, nullptr);
5422}
5423
5424InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5425 TTI::TargetCostKind CostKind,
5426 FastMathFlags FMF) {
5427 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5428 return getIntrinsicInstrCost(ICA, CostKind);
5429}
5430
5431InstructionCost
5432X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5433 FastMathFlags FMF,
5434 TTI::TargetCostKind CostKind) {
5435 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5436
5437 MVT MTy = LT.second;
5438
5439 int ISD;
5440 if (ValTy->isIntOrIntVectorTy()) {
5441 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5442 : ISD::SMIN;
5443 } else {
5444 assert(ValTy->isFPOrFPVectorTy() &&
5445 "Expected floating point or integer vector type.");
5446 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5447 ? ISD::FMINNUM
5448 : ISD::FMINIMUM;
5449 }
5450
5451 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5452 // and use it as the cost.
5453
5454 static const CostTblEntry SSE2CostTbl[] = {
5455 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5456 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5457 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5458 };
5459
5460 static const CostTblEntry SSE41CostTbl[] = {
5461 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5462 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5463 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5464 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5465 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5466 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5467 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5468 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5469 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5470 {ISD::SMIN, MVT::v16i8, 6},
5471 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5472 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5473 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5474 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5475 };
5476
5477 static const CostTblEntry AVX1CostTbl[] = {
5478 {ISD::SMIN, MVT::v16i16, 6},
5479 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5480 {ISD::SMIN, MVT::v32i8, 8},
5481 {ISD::UMIN, MVT::v32i8, 8},
5482 };
5483
5484 static const CostTblEntry AVX512BWCostTbl[] = {
5485 {ISD::SMIN, MVT::v32i16, 8},
5486 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5487 {ISD::SMIN, MVT::v64i8, 10},
5488 {ISD::UMIN, MVT::v64i8, 10},
5489 };
5490
5491 // Before legalizing the type, give a chance to look up illegal narrow types
5492 // in the table.
5493 // FIXME: Is there a better way to do this?
5494 EVT VT = TLI->getValueType(DL, ValTy);
5495 if (VT.isSimple()) {
5496 MVT MTy = VT.getSimpleVT();
5497 if (ST->hasBWI())
5498 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5499 return Entry->Cost;
5500
5501 if (ST->hasAVX())
5502 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5503 return Entry->Cost;
5504
5505 if (ST->hasSSE41())
5506 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5507 return Entry->Cost;
5508
5509 if (ST->hasSSE2())
5510 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5511 return Entry->Cost;
5512 }
5513
5514 auto *ValVTy = cast<FixedVectorType>(ValTy);
5515 unsigned NumVecElts = ValVTy->getNumElements();
5516
5517 auto *Ty = ValVTy;
5518 InstructionCost MinMaxCost = 0;
5519 if (LT.first != 1 && MTy.isVector() &&
5520 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5521 // Type needs to be split. We need LT.first - 1 operations.
5522 Ty = FixedVectorType::get(ValVTy->getElementType(),
5523 MTy.getVectorNumElements());
5524 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5525 MinMaxCost *= LT.first - 1;
5526 NumVecElts = MTy.getVectorNumElements();
5527 }
5528
5529 if (ST->hasBWI())
5530 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5531 return MinMaxCost + Entry->Cost;
5532
5533 if (ST->hasAVX())
5534 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5535 return MinMaxCost + Entry->Cost;
5536
5537 if (ST->hasSSE41())
5538 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5539 return MinMaxCost + Entry->Cost;
5540
5541 if (ST->hasSSE2())
5542 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5543 return MinMaxCost + Entry->Cost;
5544
5545 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5546
5547 // Special case power of 2 reductions where the scalar type isn't changed
5548 // by type legalization.
5549 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5550 ScalarSize != MTy.getScalarSizeInBits())
5551 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5552
5553 // Now handle reduction with the legal type, taking into account size changes
5554 // at each level.
5555 while (NumVecElts > 1) {
5556 // Determine the size of the remaining vector we need to reduce.
5557 unsigned Size = NumVecElts * ScalarSize;
5558 NumVecElts /= 2;
5559 // If we're reducing from 256/512 bits, use an extract_subvector.
5560 if (Size > 128) {
5561 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5562 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5563 CostKind, NumVecElts, SubTy);
5564 Ty = SubTy;
5565 } else if (Size == 128) {
5566 // Reducing from 128 bits is a permute of v2f64/v2i64.
5567 VectorType *ShufTy;
5568 if (ValTy->isFloatingPointTy())
5569 ShufTy =
5570 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5571 else
5572 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5573 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5574 std::nullopt, CostKind, 0, nullptr);
5575 } else if (Size == 64) {
5576 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5577 FixedVectorType *ShufTy;
5578 if (ValTy->isFloatingPointTy())
5579 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5580 else
5581 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5582 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5583 std::nullopt, CostKind, 0, nullptr);
5584 } else {
5585 // Reducing from smaller size is a shift by immediate.
5586 auto *ShiftTy = FixedVectorType::get(
5587 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5588 MinMaxCost += getArithmeticInstrCost(
5589 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5590 {TTI::OK_AnyValue, TTI::OP_None},
5591 {TTI::OK_UniformConstantValue, TTI::OP_None});
5592 }
5593
5594 // Add the arithmetic op for this level.
5595 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5596 }
5597
5598 // Add the final extract element to the cost.
5599 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5600 CostKind, 0, nullptr, nullptr);
5601}
5602
5603/// Calculate the cost of materializing a 64-bit value. This helper
5604/// method might only calculate a fraction of a larger immediate. Therefore it
5605/// is valid to return a cost of ZERO.
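/// For example, materializing 0x1122334455667788 does not fit a
/// sign-extended 32-bit immediate, so it is costed 2 * TCC_Basic below.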
5606InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5607 if (Val == 0)
5608 return TTI::TCC_Free;
5609
5610 if (isInt<32>(Val))
5611 return TTI::TCC_Basic;
5612
5613 return 2 * TTI::TCC_Basic;
5614}
5615
5616InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5617 TTI::TargetCostKind CostKind) {
5618 assert(Ty->isIntegerTy());
5619
5620 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5621 if (BitSize == 0)
5622 return ~0U;
5623
5624 // Never hoist constants larger than 128bit, because this might lead to
5625 // incorrect code generation or assertions in codegen.
5626 // FIXME: Create a cost model for types larger than i128 once the codegen
5627 // issues have been fixed.
5628 if (BitSize > 128)
5629 return TTI::TCC_Free;
5630
5631 if (Imm == 0)
5632 return TTI::TCC_Free;
5633
5634 // Sign-extend all constants to a multiple of 64-bit.
5635 APInt ImmVal = Imm;
5636 if (BitSize % 64 != 0)
5637 ImmVal = Imm.sext(alignTo(BitSize, 64));
5638
5639 // Split the constant into 64-bit chunks and calculate the cost for each
5640 // chunk.
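// (E.g. a 128-bit immediate with an all-zero high half costs just the
// low chunk: the ashr'd high chunk becomes 0, which is TCC_Free.)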
5641 InstructionCost Cost = 0;
5642 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5643 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5644 int64_t Val = Tmp.getSExtValue();
5645 Cost += getIntImmCost(Val);
5646 }
5647 // We need at least one instruction to materialize the constant.
5648 return std::max<InstructionCost>(1, Cost);
5649}
5650
5651InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5652 const APInt &Imm, Type *Ty,
5653 TTI::TargetCostKind CostKind,
5654 Instruction *Inst) {
5655 assert(Ty->isIntegerTy());
5656
5657 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5658 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5659 // here, so that constant hoisting will ignore this constant.
5660 if (BitSize == 0)
5661 return TTI::TCC_Free;
5662
5663 unsigned ImmIdx = ~0U;
5664 switch (Opcode) {
5665 default:
5666 return TTI::TCC_Free;
5667 case Instruction::GetElementPtr:
5668 // Always hoist the base address of a GetElementPtr. This prevents the
5669 // creation of new constants for every base constant that gets constant
5670 // folded with the offset.
5671 if (Idx == 0)
5672 return 2 * TTI::TCC_Basic;
5673 return TTI::TCC_Free;
5674 case Instruction::Store:
5675 ImmIdx = 0;
5676 break;
5677 case Instruction::ICmp:
5678 // This is an imperfect hack to prevent constant hoisting of
5679 // compares that might be trying to check if a 64-bit value fits in
5680 // 32-bits. The backend can optimize these cases using a right shift by 32.
5681 // Ideally we would check the compare predicate here. There are also other
5682 // similar immediates the backend can use shifts for.
5683 if (Idx == 1 && Imm.getBitWidth() == 64) {
5684 uint64_t ImmVal = Imm.getZExtValue();
5685 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5686 return TTI::TCC_Free;
5687 }
5688 ImmIdx = 1;
5689 break;
5690 case Instruction::And:
5691 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5692 // by using a 32-bit operation with implicit zero extension. Detect such
5693 // immediates here as the normal path expects bit 31 to be sign extended.
5694 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5695 return TTI::TCC_Free;
5696 ImmIdx = 1;
5697 break;
5698 case Instruction::Add:
5699 case Instruction::Sub:
5700 // For add/sub, we can use the opposite instruction for INT32_MIN.
5701 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5702 return TTI::TCC_Free;
5703 ImmIdx = 1;
5704 break;
5705 case Instruction::UDiv:
5706 case Instruction::SDiv:
5707 case Instruction::URem:
5708 case Instruction::SRem:
5709 // Division by constant is typically expanded later into a different
5710 // instruction sequence. This completely changes the constants.
5711 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5712 return TTI::TCC_Free;
5713 case Instruction::Mul:
5714 case Instruction::Or:
5715 case Instruction::Xor:
5716 ImmIdx = 1;
5717 break;
5718 // Always return TCC_Free for the shift value of a shift instruction.
5719 case Instruction::Shl:
5720 case Instruction::LShr:
5721 case Instruction::AShr:
5722 if (Idx == 1)
5723 return TTI::TCC_Free;
5724 break;
5725 case Instruction::Trunc:
5726 case Instruction::ZExt:
5727 case Instruction::SExt:
5728 case Instruction::IntToPtr:
5729 case Instruction::PtrToInt:
5730 case Instruction::BitCast:
5731 case Instruction::PHI:
5732 case Instruction::Call:
5733 case Instruction::Select:
5734 case Instruction::Ret:
5735 case Instruction::Load:
5736 break;
5737 }
5738
5739 if (Idx == ImmIdx) {
5740 uint64_t NumConstants = divideCeil(BitSize, 64);
5741 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5742 return (Cost <= NumConstants * TTI::TCC_Basic)
5743 ? static_cast<int>(TTI::TCC_Free)
5744 : Cost;
5745 }
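// (E.g. 'and rax, -16': the immediate is a sign-extended 32-bit value,
// so materializing it costs TCC_Basic <= 1 * TCC_Basic and it is
// reported free, i.e. left to be folded into the instruction.)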
5746
5747 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5748}
5749
5750InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5751 const APInt &Imm, Type *Ty,
5752 TTI::TargetCostKind CostKind) {
5753 assert(Ty->isIntegerTy());
5754
5755 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5756 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5757 // here, so that constant hoisting will ignore this constant.
5758 if (BitSize == 0)
5759 return TTI::TCC_Free;
5760
5761 switch (IID) {
5762 default:
5763 return TTI::TCC_Free;
5764 case Intrinsic::sadd_with_overflow:
5765 case Intrinsic::uadd_with_overflow:
5766 case Intrinsic::ssub_with_overflow:
5767 case Intrinsic::usub_with_overflow:
5768 case Intrinsic::smul_with_overflow:
5769 case Intrinsic::umul_with_overflow:
5770 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5771 return TTI::TCC_Free;
5772 break;
5773 case Intrinsic::experimental_stackmap:
5774 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5775 return TTI::TCC_Free;
5776 break;
5777 case Intrinsic::experimental_patchpoint_void:
5778 case Intrinsic::experimental_patchpoint:
5779 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5780 return TTI::TCC_Free;
5781 break;
5782 }
5783 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5784}
5785
5786InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5787 TTI::TargetCostKind CostKind,
5788 const Instruction *I) {
5789 if (CostKind != TTI::TCK_RecipThroughput)
5790 return Opcode == Instruction::PHI ? 0 : 1;
5791 // Branches are assumed to be predicted.
5792 return 0;
5793}
5794
5795int X86TTIImpl::getGatherOverhead() const {
5796 // Some CPUs have more overhead for gather. The specified overhead is relative
5797 // to the Load operation. "2" is the number provided by Intel architects. This
5798 // parameter is used for cost estimation of Gather Op and comparison with
5799 // other alternatives.
5800 // TODO: Remove the explicit hasAVX512()? That would mean we would only
5801 // enable gather with a -march.
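// (Combined with getGSVectorCost below, a VF = 8 gather on a CPU with
// fast gather is therefore costed as ~2 plus 8 scalar loads, while other
// CPUs get a prohibitive 1024.)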
5802 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5803 return 2;
5804
5805 return 1024;
5806}
5807
5808int X86TTIImpl::getScatterOverhead() const {
5809 if (ST->hasAVX512())
5810 return 2;
5811
5812 return 1024;
5813}
5814
5815// Return an average cost of a Gather / Scatter instruction, which may be improved later.
5816InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
5817 TTI::TargetCostKind CostKind,
5818 Type *SrcVTy, const Value *Ptr,
5819 Align Alignment,
5820 unsigned AddressSpace) {
5821
5822 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5823 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5824
5825 // Try to reduce index size from 64 bit (default for GEP)
5826 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5827 // operation will use 16 x 64 indices which do not fit in a zmm and need
5828 // to be split. Also check that the base pointer is the same for all lanes,
5829 // and that there's at most one variable index.
5830 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5831 unsigned IndexSize = DL.getPointerSizeInBits();
5832 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5833 if (IndexSize < 64 || !GEP)
5834 return IndexSize;
5835
5836 unsigned NumOfVarIndices = 0;
5837 const Value *Ptrs = GEP->getPointerOperand();
5838 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5839 return IndexSize;
5840 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5841 if (isa<Constant>(GEP->getOperand(I)))
5842 continue;
5843 Type *IndxTy = GEP->getOperand(I)->getType();
5844 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5845 IndxTy = IndexVTy->getElementType();
5846 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5847 !isa<SExtInst>(GEP->getOperand(I))) ||
5848 ++NumOfVarIndices > 1)
5849 return IndexSize; // 64
5850 }
5851 return (unsigned)32;
5852 };
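// (E.g. a gather whose pointer is 'gep p, <16 x i32> idx': all lanes
// share the base pointer and the single variable index is 32-bit wide,
// so the lambda returns 32 and the indices fit in one zmm register.)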
5853
5854 // Trying to reduce IndexSize to 32 bits for vectors with 16 elements.
5855 // By default the IndexSize is equal to pointer size.
5856 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5857 ? getIndexSizeInBits(Ptr, DL)
5858 : DL.getPointerSizeInBits();
5859
5860 auto *IndexVTy = FixedVectorType::get(
5861 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5862 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5863 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5864 InstructionCost::CostType SplitFactor =
5865 *std::max(IdxsLT.first, SrcLT.first).getValue();
5866 if (SplitFactor > 1) {
5867 // Handle splitting of vector of pointers
5868 auto *SplitSrcTy =
5869 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5870 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
5871 Alignment, AddressSpace);
5872 }
5873
5874 // If we didn't split, this will be a single gather/scatter instruction.
5875 if (CostKind != TTI::TCK_RecipThroughput)
5876 return 1;
5877
5878 // The gather / scatter cost is given by Intel architects. It is a rough
5879 // number since we are looking at one instruction at a time.
5880 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
5881 : getScatterOverhead();
5882 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5883 MaybeAlign(Alignment), AddressSpace,
5884 CostKind);
5885}
5886
5887/// Calculate the cost of Gather / Scatter operation
5888InstructionCost X86TTIImpl::getGatherScatterOpCost(
5889 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5890 Align Alignment, TTI::TargetCostKind CostKind,
5891 const Instruction *I = nullptr) {
5892 if ((Opcode == Instruction::Load &&
5893 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5894 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5895 Align(Alignment)))) ||
5896 (Opcode == Instruction::Store &&
5897 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5898 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5899 Align(Alignment)))))
5900 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5901 Alignment, CostKind, I);
5902
5903 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5904 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5905 if (!PtrTy && Ptr->getType()->isVectorTy())
5906 PtrTy = dyn_cast<PointerType>(
5907 cast<VectorType>(Ptr->getType())->getElementType());
5908 assert(PtrTy && "Unexpected type for Ptr argument");
5909 unsigned AddressSpace = PtrTy->getAddressSpace();
5910 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
5911 AddressSpace);
5912}
5913
5914bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5915 const TargetTransformInfo::LSRCost &C2) {
5916 // What is X86-specific here is that instruction count gets first priority.
5917 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5918 C1.NumIVMuls, C1.NumBaseAdds,
5919 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5920 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5921 C2.NumIVMuls, C2.NumBaseAdds,
5922 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5923}
5924
5925bool X86TTIImpl::canMacroFuseCmp() {
5926 return ST->hasMacroFusion() || ST->hasBranchFusion();
5927}
5928
5929bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5930 Type *ScalarTy = DataTy->getScalarType();
5931
5932 // The backend can't handle a single element vector w/o CFCMOV.
5933 if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5934 return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
5935
5936 if (!ST->hasAVX())
5937 return false;
5938
5939 if (ScalarTy->isPointerTy())
5940 return true;
5941
5942 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5943 return true;
5944
5945 if (ScalarTy->isHalfTy() && ST->hasBWI())
5946 return true;
5947
5948 if (ScalarTy->isBFloatTy() && ST->hasBF16())
5949 return true;
5950
5951 if (!ScalarTy->isIntegerTy())
5952 return false;
5953
5954 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5955 return IntWidth == 32 || IntWidth == 64 ||
5956 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5957}
5958
5959bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5960 return isLegalMaskedLoad(DataType, Alignment);
5961}
5962
5963bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5964 unsigned DataSize = DL.getTypeStoreSize(DataType);
5965 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5966 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5967 // (the equivalent stores only require AVX).
5968 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5969 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5970
5971 return false;
5972}
5973
5974bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5975 unsigned DataSize = DL.getTypeStoreSize(DataType);
5976
5977 // SSE4A supports nontemporal stores of float and double at arbitrary
5978 // alignment.
5979 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5980 return true;
5981
5982 // Besides the SSE4A subtarget exception above, only aligned stores are
5983 // available nontemporally on any other subtarget. And only stores with a size
5984 // of 4..32 bytes (powers of 2 only) are permitted.
5985 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5986 !isPowerOf2_32(DataSize))
5987 return false;
5988
5989 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5990 // loads require AVX2).
5991 if (DataSize == 32)
5992 return ST->hasAVX();
5993 if (DataSize == 16)
5994 return ST->hasSSE1();
5995 return true;
5996}
5997
5998bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5999 ElementCount NumElements) const {
6000 // movddup
6001 return ST->hasSSE3() && !NumElements.isScalable() &&
6002 NumElements.getFixedValue() == 2 &&
6003 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6004}
6005
6006bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6007 if (!isa<VectorType>(DataTy))
6008 return false;
6009
6010 if (!ST->hasAVX512())
6011 return false;
6012
6013 // The backend can't handle a single element vector.
6014 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6015 return false;
6016
6017 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6018
6019 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6020 return true;
6021
6022 if (!ScalarTy->isIntegerTy())
6023 return false;
6024
6025 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6026 return IntWidth == 32 || IntWidth == 64 ||
6027 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6028}
6029
6030bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6031 return isLegalMaskedExpandLoad(DataTy, Alignment);
6032}
6033
6034bool X86TTIImpl::supportsGather() const {
6035 // Some CPUs have better gather performance than others.
6036 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6037 // enable gather with a -march.
6038 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6039}
6040
6041bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6042 // Gather / Scatter for 2-element vectors is not profitable on KNL / SKX.
6043 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6044 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6045 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6046 // Check, maybe the gather/scatter instruction is better in the VariableMask
6047 // case.
6048 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6049 return NumElts == 1 ||
6050 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6051}
6051}
6052
6053bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6054 Type *ScalarTy = DataTy->getScalarType();
6055 if (ScalarTy->isPointerTy())
6056 return true;
6057
6058 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6059 return true;
6060
6061 if (!ScalarTy->isIntegerTy())
6062 return false;
6063
6064 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6065 return IntWidth == 32 || IntWidth == 64;
6066}
6067
6068bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6069 if (!supportsGather() || !ST->preferGather())
6070 return false;
6071 return isLegalMaskedGatherScatter(DataTy, Alignment);
6072}
6073
6074bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6075 unsigned Opcode1,
6076 const SmallBitVector &OpcodeMask) const {
6077 // ADDSUBPS 4xf32 SSE3
6078 // VADDSUBPS 4xf32 AVX
6079 // VADDSUBPS 8xf32 AVX2
6080 // ADDSUBPD 2xf64 SSE3
6081 // VADDSUBPD 2xf64 AVX
6082 // VADDSUBPD 4xf64 AVX2
6083
6084 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6085 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6086 if (!isPowerOf2_32(NumElements))
6087 return false;
6088 // Check the opcode pattern. We apply the mask on the opcode arguments and
6089 // then check if it is what we expect.
6090 for (int Lane : seq<int>(0, NumElements)) {
6091 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6092 // We expect FSub for even lanes and FAdd for odd lanes.
6093 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6094 return false;
6095 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6096 return false;
6097 }
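// (E.g. with Opcode0 = FSub and Opcode1 = FAdd, an OpcodeMask of 0b1010
// over 4 lanes encodes fsub,fadd,fsub,fadd - the ADDSUBPS pattern.)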
6098 // Now check that the pattern is supported by the target ISA.
6099 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6100 if (ElemTy->isFloatTy())
6101 return ST->hasSSE3() && NumElements % 4 == 0;
6102 if (ElemTy->isDoubleTy())
6103 return ST->hasSSE3() && NumElements % 2 == 0;
6104 return false;
6105}
6106
6107bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6108 // AVX2 doesn't support scatter
6109 if (!ST->hasAVX512() || !ST->preferScatter())
6110 return false;
6111 return isLegalMaskedGatherScatter(DataType, Alignment);
6112}
6113
6114bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6115 EVT VT = TLI->getValueType(DL, DataType);
6116 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6117}
6118
6119bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6120 // FDIV is always expensive, even if it has a very low uop count.
6121 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6122 if (I->getOpcode() == Instruction::FDiv)
6123 return true;
6124
6125 return BaseT::isExpensiveToSpeculativelyExecute(I);
6126}
6127
6128bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6129 return false;
6130}
6131
6132bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6133 const Function *Callee) const {
6134 const TargetMachine &TM = getTLI()->getTargetMachine();
6135
6136 // Work this as a subsetting of subtarget features.
6137 const FeatureBitset &CallerBits =
6138 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6139 const FeatureBitset &CalleeBits =
6140 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6141
6142 // Check whether features are the same (apart from the ignore list).
6143 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6144 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6145 if (RealCallerBits == RealCalleeBits)
6146 return true;
6147
6148 // If the features are a subset, we need to additionally check for calls
6149 // that may become ABI-incompatible as a result of inlining.
6150 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6151 return false;
6152
6153 for (const Instruction &I : instructions(Callee)) {
6154 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6155 // Having more target features is fine for inline ASM.
6156 if (CB->isInlineAsm())
6157 continue;
6158
6159 SmallVector<Type *, 8> Types;
6160 for (Value *Arg : CB->args())
6161 Types.push_back(Arg->getType());
6162 if (!CB->getType()->isVoidTy())
6163 Types.push_back(CB->getType());
6164
6165 // Simple types are always ABI compatible.
6166 auto IsSimpleTy = [](Type *Ty) {
6167 return !Ty->isVectorTy() && !Ty->isAggregateType();
6168 };
6169 if (all_of(Types, IsSimpleTy))
6170 continue;
6171
6172 if (Function *NestedCallee = CB->getCalledFunction()) {
6173 // Assume that intrinsics are always ABI compatible.
6174 if (NestedCallee->isIntrinsic())
6175 continue;
6176
6177 // Do a precise compatibility check.
6178 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6179 return false;
6180 } else {
6181 // We don't know the target features of the callee,
6182 // assume it is incompatible.
6183 return false;
6184 }
6185 }
6186 }
6187 return true;
6188}
6189
6190bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6191 const Function *Callee,
6192 const ArrayRef<Type *> &Types) const {
6193 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6194 return false;
6195
6196 // If we get here, we know the target features match. If one function
6197 // considers 512-bit vectors legal and the other does not, consider them
6198 // incompatible.
6199 const TargetMachine &TM = getTLI()->getTargetMachine();
6200
6201 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6202 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6203 return true;
6204
6205 // Consider the arguments compatible if they aren't vectors or aggregates.
6206 // FIXME: Look at the size of vectors.
6207 // FIXME: Look at the element types of aggregates to see if there are vectors.
6208 return llvm::none_of(Types,
6209 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6210}
6211
6212X86TTIImpl::TTI::MemCmpExpansionOptions
6213X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6214 TTI::MemCmpExpansionOptions Options;
6215 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6216 Options.NumLoadsPerBlock = 2;
6217 // All GPR and vector loads can be unaligned.
6218 Options.AllowOverlappingLoads = true;
6219 if (IsZeroCmp) {
6220 // Only enable vector loads for equality comparison. Right now the vector
6221 // version is not as fast for three way compare (see #33329).
6222 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6223 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6224 Options.LoadSizes.push_back(64);
6225 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6226 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6227 }
6228 if (ST->is64Bit()) {
6229 Options.LoadSizes.push_back(8);
6230 }
6231 Options.LoadSizes.push_back(4);
6232 Options.LoadSizes.push_back(2);
6233 Options.LoadSizes.push_back(1);
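// (Sketch: with these options, a 31-byte equality memcmp on an AVX2
// 64-bit target can be expanded into overlapping 16-byte vector
// compares instead of a library call.)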
6234 return Options;
6235}
6236
6237bool X86TTIImpl::prefersVectorizedAddressing() const {
6238 return supportsGather();
6239}
6240
6241bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6242 return false;
6243}
6244
6245bool X86TTIImpl::enableInterleavedAccessVectorization() {
6246 // TODO: We expect this to be beneficial regardless of arch,
6247 // but there are currently some unexplained performance artifacts on Atom.
6248 // As a temporary solution, disable on Atom.
6249 return !(ST->isAtom());
6250}
6251
6252// Get estimation for interleaved load/store operations and strided load.
6253// \p Indices contains indices for strided load.
6254// \p Factor - the factor of interleaving.
6255// AVX-512 provides 3-src shuffles that significantly reduce the cost.
6256InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6257 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6258 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6259 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6260 // VecTy for interleave memop is <VF*Factor x Elt>.
6261 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6262 // VecTy = <12 x i32>.
6263
6264 // Calculate the number of memory operations (NumOfMemOps), required
6265 // for load/store the VecTy.
6266 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6267 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6268 unsigned LegalVTSize = LegalVT.getStoreSize();
6269 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6270
6271 // Get the cost of one memory operation.
6272 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6273 LegalVT.getVectorNumElements());
6274 InstructionCost MemOpCost;
6275 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6276 if (UseMaskedMemOp)
6277 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6278 AddressSpace, CostKind);
6279 else
6280 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6281 AddressSpace, CostKind);
6282
6283 unsigned VF = VecTy->getNumElements() / Factor;
6284 MVT VT =
6285 MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6286
6287 InstructionCost MaskCost;
6288 if (UseMaskedMemOp) {
6289 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6290 for (unsigned Index : Indices) {
6291 assert(Index < Factor && "Invalid index for interleaved memory op");
6292 for (unsigned Elm = 0; Elm < VF; Elm++)
6293 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6294 }
6295
6296 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6297
6298 MaskCost = getReplicationShuffleCost(
6299 I1Type, Factor, VF,
6300 UseMaskForGaps ? DemandedLoadStoreElts
6301 : APInt::getAllOnes(VecTy->getNumElements()),
6302 CostKind);
6303
6304 // The Gaps mask is invariant and created outside the loop, therefore the
6305 // cost of creating it is not accounted for here. However if we have both
6306 // a MaskForGaps and some other mask that guards the execution of the
6307 // memory access, we need to account for the cost of And-ing the two masks
6308 // inside the loop.
6309 if (UseMaskForGaps) {
6310 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6311 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6312 }
6313 }
6314
6315 if (Opcode == Instruction::Load) {
6316 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6317 // contain the cost of the optimized shuffle sequence that the
6318 // X86InterleavedAccess pass will generate.
6319 // The costs of the loads and stores are computed separately from the table.
6320
6321 // X86InterleavedAccess supports only the following interleaved-access groups.
6322 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6323 {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
6324 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6325 {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
6326 };
6327
6328 if (const auto *Entry =
6329 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6330 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6331 // If an entry does not exist, fall back to the default implementation.
6332
6333 // The kind of shuffle depends on the number of loaded values.
6334 // If we load the entire data in one register, we can use a 1-src shuffle.
6335 // Otherwise, we'll merge 2 sources in each operation.
6336 TTI::ShuffleKind ShuffleKind =
6337 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6338
6339 InstructionCost ShuffleCost = getShuffleCost(
6340 ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6341
6342 unsigned NumOfLoadsInInterleaveGrp =
6343 Indices.size() ? Indices.size() : Factor;
6344 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6345 VecTy->getNumElements() / Factor);
6346 InstructionCost NumOfResults =
6347 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6348
6349 // About half of the loads may be folded into shuffles when we have only
6350 // one result. If we have more than one result, or the loads are masked,
6351 // we do not fold loads at all.
6352 unsigned NumOfUnfoldedLoads =
6353 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6354
6355 // Get the number of shuffle operations per result.
6356 unsigned NumOfShufflesPerResult =
6357 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6358
6359 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6360 // When we have more than one destination, we need additional instructions
6361 // to keep the sources.
6362 InstructionCost NumOfMoves = 0;
6363 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6364 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6365
6366 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6367 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6368 NumOfMoves;
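// [Editor's note] Worked example, assuming a plain (unmasked) AVX-512 load
// with Factor = 2, VecTy = <32 x i32> and empty Indices: the type splits
// into two v16i32 memory ops, so ShuffleKind = SK_PermuteTwoSrc,
// NumOfResults = 2, NumOfShufflesPerResult = 1, NumOfUnfoldedLoads = 2,
// NumOfMoves = 2 * 1 / 2 = 1, giving
// Cost = 2 * ShuffleCost + 2 * MemOpCost + 1.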
6369
6370 return Cost;
6371 }
6372
6373 // Store.
6374 assert(Opcode == Instruction::Store &&
6375 "Expected Store Instruction at this point");
6376 // X86InterleavedAccess supports only the following interleaved-access groups.
6377 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6378 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6379 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6380 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6381
6382 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6383 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6384 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6385 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6386 };
6387
6388 if (const auto *Entry =
6389 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6390 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6391 // If an entry does not exist, fall back to the default implementation.
6392
6393 // There are no strided stores at the moment, and a store can't be folded
6394 // into a shuffle.
6395 unsigned NumOfSources = Factor; // The number of values to be merged.
6396 InstructionCost ShuffleCost = getShuffleCost(
6397 TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6398 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6399
6400 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6401 // We need additional instructions to keep the sources.
6402 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6403 InstructionCost Cost =
6404 MaskCost +
6405 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6406 NumOfMoves;
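// [Editor's note] Worked example mirroring the load case above: storing a
// Factor = 2 interleave of <32 x i32> (two v16i32 memory ops) gives
// NumOfShufflesPerStore = 1 and NumOfMoves = 2 * 1 / 2 = 1, so
// Cost = 2 * (MemOpCost + ShuffleCost) + 1.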
6407 return Cost;
6408}
6409
6410InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6411 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6412 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6413 bool UseMaskForCond, bool UseMaskForGaps) {
6414 auto *VecTy = cast<FixedVectorType>(BaseTy);
6415
6416 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6417 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6418 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6419 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6420 return true;
6421 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6422 return ST->hasBWI();
6423 if (EltTy->isBFloatTy())
6424 return ST->hasBF16();
6425 return false;
6426 };
6427 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6428 return getInterleavedMemoryOpCostAVX512(
6429 Opcode, VecTy, Factor, Indices, Alignment,
6430 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6431
6432 if (UseMaskForCond || UseMaskForGaps)
6433 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6434 Alignment, AddressSpace, CostKind,
6435 UseMaskForCond, UseMaskForGaps);
6436
6437 // Get estimation for interleaved load/store operations for SSE-AVX2.
6438 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6439 // computing the cost using a generic formula as a function of generic
6440 // shuffles. We therefore use a lookup table instead, filled according to
6441 // the instruction sequences that codegen currently generates.
6442
6443 // VecTy for interleave memop is <VF*Factor x Elt>.
6444 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6445 // VecTy = <12 x i32>.
6446 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6447
6448 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6449 // VF=2, while v2i128 is an unsupported MVT vector type
6450 // (see MachineValueType.h::getVectorVT()).
6451 if (!LegalVT.isVector())
6452 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6453 Alignment, AddressSpace, CostKind);
6454
6455 unsigned VF = VecTy->getNumElements() / Factor;
6456 Type *ScalarTy = VecTy->getElementType();
6457 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6458 if (!ScalarTy->isIntegerTy())
6459 ScalarTy =
6460 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
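// [Editor's note] For example, a <16 x float> group with Factor = 2 is
// modeled here with i32 elements, so VF = 8 and VT = v8i32 below, hitting
// the same {2, MVT::v8i32, ...} table entries as an equivalent <16 x i32>
// group.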
6461
6462 // Get the cost of all the memory operations.
6463 // FIXME: discount dead loads.
6464 InstructionCost MemOpCosts = getMemoryOpCost(
6465 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6466
6467 auto *VT = FixedVectorType::get(ScalarTy, VF);
6468 EVT ETy = TLI->getValueType(DL, VT);
6469 if (!ETy.isSimple())
6470 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6471 Alignment, AddressSpace, CostKind);
6472
6473 // TODO: Complete for other data-types and strides.
6474 // Each combination of Stride, element bit width and VF results in a different
6475 // sequence; the cost tables are therefore accessed with:
6476 // Factor (stride) and VectorType=VFxiN.
6477 // The Cost accounts only for the shuffle sequence;
6478 // the cost of the loads/stores is accounted for separately.
6479 //
6480 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6481 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6482 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6483 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6484 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6485 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6486
6487 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6488 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6489 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6490
6491 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6492 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6493 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6494
6495 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6496 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6497 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6498 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6499
6500 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6501 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6502 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6503 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6504 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6505
6506 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6507 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6508 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6509 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6510 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6511
6512 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6513 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6514 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6515 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6516 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6517
6518 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6519 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6520 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6521 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6522
6523 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6524 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6525 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6526 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6527 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6528
6529 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6530 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6531 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6532 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6533 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6534
6535 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6536 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6537 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6538 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6539 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6540
6541 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6542 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6543 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6544 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6545
6546 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6547 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6548 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6549 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6550 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6551
6552 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6553 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6554 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6555 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6556 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6557
6558 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6559 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6560 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6561 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6562
6563 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6564 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6565 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6566
6567 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6568 };
6569
6570 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6571 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6572 };
6573
6574 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6575 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6576 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6577
6578 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6579 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6580
6581 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6582 };
6583
6584 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6585 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6586 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6587
6588 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6589 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6590 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6591
6592 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6593 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6594 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6595 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6596
6597 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6598 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6599 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6600 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6601 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6602
6603 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6604 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6605 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6606 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6607 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6608
6609 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6610 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6611 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6612 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6613 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6614
6615 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6616 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6617 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6618 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6619 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6620
6621 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6622 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6623 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6624 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6625
6626 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6627 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6628 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6629 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6630 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6631
6632 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6633 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6634 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6635 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6636 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6637
6638 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6639 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6640 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6641 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6642 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6643
6644 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6645 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6646 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6647 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6648
6649 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6650 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6651 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6652 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6653 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6654
6655 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6656 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6657 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6658 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6659 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6660
6661 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6662 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6663 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6664 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6665
6666 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6667 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6668 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6669 };
6670
6671 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6672 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6673 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6674 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6675
6676 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6677 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6678
6679 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6680 };
6681
6682 if (Opcode == Instruction::Load) {
6683 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6684 MemOpCosts](const CostTblEntry *Entry) {
6685 // NOTE: this is just an approximation!
6686 // It can over- or under-estimate the cost!
6687 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6688 };
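// [Editor's note] Worked example: an AVX2 load group with Factor = 3,
// ETy = v16i8 and two live members (Indices.size() == 2) matches the
// {3, MVT::v16i8, 11} entry above and returns
// MemOpCosts + divideCeil(2 * 11, 3) = MemOpCosts + 8.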
6689
6690 if (ST->hasAVX2())
6691 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6692 ETy.getSimpleVT()))
6693 return GetDiscountedCost(Entry);
6694
6695 if (ST->hasSSSE3())
6696 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6697 ETy.getSimpleVT()))
6698 return GetDiscountedCost(Entry);
6699
6700 if (ST->hasSSE2())
6701 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6702 ETy.getSimpleVT()))
6703 return GetDiscountedCost(Entry);
6704 } else {
6705 assert(Opcode == Instruction::Store &&
6706 "Expected Store Instruction at this point");
6707 assert((!Indices.size() || Indices.size() == Factor) &&
6708 "Interleaved store only supports fully-interleaved groups.");
6709 if (ST->hasAVX2())
6710 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6711 ETy.getSimpleVT()))
6712 return MemOpCosts + Entry->Cost;
6713
6714 if (ST->hasSSE2())
6715 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6716 ETy.getSimpleVT()))
6717 return MemOpCosts + Entry->Cost;
6718 }
6719
6720 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6721 Alignment, AddressSpace, CostKind,
6722 UseMaskForCond, UseMaskForGaps);
6723}
6724
6725InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6726 StackOffset BaseOffset,
6727 bool HasBaseReg, int64_t Scale,
6728 unsigned AddrSpace) const {
6729 // Scaling factors are not free at all.
6730 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6731 // will take 2 allocations in the out-of-order engine instead of 1
6732 // for a plain addressing mode, i.e. inst (reg1).
6733 // E.g.,
6734 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6735 // Requires two allocations (one for the load, one for the computation)
6736 // whereas:
6737 // vaddps (%rsi), %ymm0, %ymm1
6738 // Requires just 1 allocation, i.e., freeing allocations for other operations
6739 // and having fewer micro-operations to execute.
6740 //
6741 // For some X86 architectures, this is even worse because, for instance, for
6742 // stores, the complex addressing mode forces the instruction to use the
6743 // "load" ports instead of the dedicated "store" port.
6744 // E.g., on Haswell:
6745 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6746 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6747 TargetLoweringBase::AddrMode AM;
6748 AM.BaseGV = BaseGV;
6749 AM.BaseOffs = BaseOffset.getFixed();
6750 AM.HasBaseReg = HasBaseReg;
6751 AM.Scale = Scale;
6752 AM.ScalableOffset = BaseOffset.getScalable();
6753 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6754 // Scale represents reg2 * scale, thus account for 1
6755 // as soon as we use a second register.
6756 return AM.Scale != 0;
6757 return -1;
6758}
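// [Editor's note] Illustrative interpretation, not from the original source:
// for a legal scaled mode such as vaddps (%rsi,%rdx), %ymm0, %ymm1 the
// function returns 1 (AM.Scale != 0); for the plain vaddps (%rsi), %ymm0,
// %ymm1 it returns 0; and it returns -1 to signal an addressing mode that is
// not supported on the target at all.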
6759
6760InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
6761 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
6762 return 14;
6763}