X86TargetTransformInfo.cpp
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// A few notes about the cost model numbers used below: the numbers
16/// correspond to a "generic" X86 CPU rather than to a specific CPU model.
17/// Usually the numbers correspond to the CPU where the feature first
18/// appeared. For example, if we use Subtarget.hasSSE42() in the lookups
19/// below, the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost,
21/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of target-dependent instruction costs (latency):
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported
46/// by the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values, which
48/// are often used as the cost thresholds where TCK_SizeAndLatency is requested.
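///
/// As a concrete reading of the cost tables below: the Skylake FDIV f32 entry
/// in the AVX512 table, { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, encodes
/// TCK_RecipThroughput = 3, TCK_Latency = 11, TCK_CodeSize = 1 (one
/// instruction) and TCK_SizeAndLatency = 1 (one micro-op).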
49//===----------------------------------------------------------------------===//
50
58#include "llvm/Support/Debug.h"
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71 // Helper struct to store/access costs for each cost kind.
72 // TODO: Move this to allow other targets to use it?
73struct CostKindCosts {
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
79 std::optional<unsigned>
80 operator[](TargetTransformInfo::TargetCostKind Kind) const {
81 unsigned Cost = ~0U;
82 switch (Kind) {
83 case TargetTransformInfo::TCK_RecipThroughput:
84 Cost = RecipThroughputCost;
85 break;
86 case TargetTransformInfo::TCK_Latency:
87 Cost = LatencyCost;
88 break;
89 case TargetTransformInfo::TCK_CodeSize:
90 Cost = CodeSizeCost;
91 break;
92 case TargetTransformInfo::TCK_SizeAndLatency:
93 Cost = SizeAndLatencyCost;
94 break;
95 }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
101using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
102
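// Note: entries that list a single value, e.g. the { 6 } rows in the division
// tables below, only populate RecipThroughputCost; the remaining cost kinds
// stay at ~0U, so operator[] returns std::nullopt for them and the lookup
// falls through to a later table or the base implementation.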
103TargetTransformInfo::PopcntSupportKind
104X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
105 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106 // TODO: Currently the __builtin_popcount() implementation using SSE3
107 // instructions is inefficient. Once the problem is fixed, we should
108 // call ST->hasSSE3() instead of ST->hasPOPCNT().
109 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110}
111
112std::optional<unsigned> X86TTIImpl::getCacheSize(
113 TargetTransformInfo::CacheLevel Level) const {
114 switch (Level) {
115 case TargetTransformInfo::CacheLevel::L1D:
116 // - Penryn
117 // - Nehalem
118 // - Westmere
119 // - Sandy Bridge
120 // - Ivy Bridge
121 // - Haswell
122 // - Broadwell
123 // - Skylake
124 // - Kabylake
125 return 32 * 1024; // 32 KByte
126 case TargetTransformInfo::CacheLevel::L2D:
127 // - Penryn
128 // - Nehalem
129 // - Westmere
130 // - Sandy Bridge
131 // - Ivy Bridge
132 // - Haswell
133 // - Broadwell
134 // - Skylake
135 // - Kabylake
136 return 256 * 1024; // 256 KByte
137 }
138
139 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140}
141
142std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
143 TargetTransformInfo::CacheLevel Level) const {
144 // - Penryn
145 // - Nehalem
146 // - Westmere
147 // - Sandy Bridge
148 // - Ivy Bridge
149 // - Haswell
150 // - Broadwell
151 // - Skylake
152 // - Kabylake
153 switch (Level) {
154 case TargetTransformInfo::CacheLevel::L1D:
155 [[fallthrough]];
156 case TargetTransformInfo::CacheLevel::L2D:
157 return 8;
158 }
159
160 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161}
162
163unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164 bool Vector = (ClassID == 1);
165 if (Vector && !ST->hasSSE1())
166 return 0;
167
168 if (ST->is64Bit()) {
169 if (Vector && ST->hasAVX512())
170 return 32;
171 return 16;
172 }
173 return 8;
174}
175
176TypeSize
177X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
178 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179 switch (K) {
180 case TargetTransformInfo::RGK_Scalar:
181 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
182 case TargetTransformInfo::RGK_FixedWidthVector:
183 if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
184 return TypeSize::getFixed(512);
185 if (ST->hasAVX() && PreferVectorWidth >= 256)
186 return TypeSize::getFixed(256);
187 if (ST->hasSSE1() && PreferVectorWidth >= 128)
188 return TypeSize::getFixed(128);
189 return TypeSize::getFixed(0);
190 case TargetTransformInfo::RGK_ScalableVector:
191 return TypeSize::getScalable(0);
192 }
193
194 llvm_unreachable("Unsupported register kind");
195}
196
197unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
198 return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
199 .getFixedValue();
200}
201
202unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
203 // If the loop will not be vectorized, don't interleave the loop.
204 // Let the regular unroller handle the loop instead, which saves the
205 // overflow check and memory check cost.
206 if (VF.isScalar())
207 return 1;
208
209 if (ST->isAtom())
210 return 1;
211
212 // Sandybridge and Haswell have multiple execution ports and pipelined
213 // vector units.
214 if (ST->hasAVX())
215 return 4;
216
217 return 2;
218}
219
220InstructionCost X86TTIImpl::getArithmeticInstrCost(
221 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
222 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
223 ArrayRef<const Value *> Args,
224 const Instruction *CxtI) {
225
226 // vXi8 multiplications are always promoted to vXi16.
227 // Sub-128-bit types can be extended/packed more efficiently.
228 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
229 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
230 Type *WideVecTy =
231 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
232 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
233 TargetTransformInfo::CastContextHint::None,
234 CostKind) +
235 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
236 TargetTransformInfo::CastContextHint::None,
237 CostKind) +
238 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
239 }
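  // E.g. for a <8 x i8> multiply the returned cost is the sum of a
  // zext <8 x i8> -> <8 x i16>, a trunc <8 x i16> -> <8 x i8>, and the
  // <8 x i16> multiply itself.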
240
241 // Legalize the type.
242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
243
244 int ISD = TLI->InstructionOpcodeToISD(Opcode);
245 assert(ISD && "Invalid opcode");
246
247 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
248 (LT.second.getScalarType() == MVT::i32 ||
249 LT.second.getScalarType() == MVT::i64)) {
250 // Check if the operands can be represented as a smaller datatype.
251 bool Op1Signed = false, Op2Signed = false;
252 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
253 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
254 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
255 bool SignedMode = Op1Signed || Op2Signed;
256
257 // If both vXi32 are representable as i15 and at least one is constant,
258 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
259 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
260 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
261 LT.second.getScalarType() == MVT::i32) {
262 bool Op1Constant =
263 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
264 bool Op2Constant =
265 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
266 bool Op1Sext = isa<SExtInst>(Args[0]) &&
267 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
268 bool Op2Sext = isa<SExtInst>(Args[1]) &&
269 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
270
271 bool IsZeroExtended = !Op1Signed || !Op2Signed;
272 bool IsConstant = Op1Constant || Op2Constant;
273 bool IsSext = Op1Sext || Op2Sext;
274 if (IsConstant || IsZeroExtended || IsSext)
275 LT.second =
276 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
277 }
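    // E.g. a v4i32 multiply where both operands are sign extensions from
    // vXi16 is, on targets where PMADDWD is not slow, costed below as a
    // v8i16 (PMADDWD-equivalent) multiply.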
278
279 // Check if the vXi32 operands can be shrunk into a smaller datatype.
280 // This should match the codegen from reduceVMULWidth.
281 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
282 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
283 if (OpMinSize <= 7)
284 return LT.first * 3; // pmullw/sext
285 if (!SignedMode && OpMinSize <= 8)
286 return LT.first * 3; // pmullw/zext
287 if (OpMinSize <= 15)
288 return LT.first * 5; // pmullw/pmulhw/pshuf
289 if (!SignedMode && OpMinSize <= 16)
290 return LT.first * 5; // pmullw/pmulhw/pshuf
291 }
292
293 // If both vXi64 operands are representable as (unsigned) i32, then we can
294 // perform the multiply with a single PMULUDQ instruction.
295 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
296 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
297 ISD = X86ISD::PMULUDQ;
298 }
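  // E.g. a v4i64 multiply where both operands are zero extensions from i32
  // is costed via the X86ISD::PMULUDQ rows below (a single pmuludq per
  // legalized vector) rather than as a generic vXi64 MUL sequence.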
299
300 // Vector multiply by pow2 will be simplified to shifts.
301 // Vector multiply by -pow2 will be simplified to shifts/negates.
302 if (ISD == ISD::MUL && Op2Info.isConstant() &&
303 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
304 InstructionCost Cost =
305 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
306 Op1Info.getNoProps(), Op2Info.getNoProps());
307 if (Op2Info.isNegatedPowerOf2())
308 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
309 return Cost;
310 }
311
312 // On X86, vector signed division by a constant power-of-two is
313 // normally expanded to the sequence SRA + SRL + ADD + SRA.
314 // The OperandValue properties may not be the same as that of the previous
315 // operation; conservatively assume OP_None.
316 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
317 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
318 InstructionCost Cost =
319 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
320 Op1Info.getNoProps(), Op2Info.getNoProps());
321 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
322 Op1Info.getNoProps(), Op2Info.getNoProps());
323 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
324 Op1Info.getNoProps(), Op2Info.getNoProps());
325
326 if (ISD == ISD::SREM) {
327 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
328 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
329 Op2Info.getNoProps());
330 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
331 Op2Info.getNoProps());
332 }
333
334 return Cost;
335 }
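  // E.g. a vector sdiv by a power-of-two constant is costed as
  // 2*AShr + LShr + Add; srem additionally pays for the Mul + Sub used to
  // reconstruct the remainder from the quotient.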
336
337 // Vector unsigned division/remainder will be simplified to shifts/masks.
338 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
339 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
340 if (ISD == ISD::UDIV)
341 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
342 Op1Info.getNoProps(), Op2Info.getNoProps());
343 // UREM
344 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
345 Op1Info.getNoProps(), Op2Info.getNoProps());
346 }
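  // E.g. a vector udiv by 16 is costed as a single LShr (by 4), and a
  // urem by 16 as a single And (with 15).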
347
348 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
350 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
351 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
352 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
353 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
354 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
355 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
356 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
357 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
358
359 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
360 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
361 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
362 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
363 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
364 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
365 };
366
367 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
368 if (const auto *Entry =
369 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
370 if (auto KindCost = Entry->Cost[CostKind])
371 return LT.first * *KindCost;
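  // In all of these lookups LT.first is the type-legalization factor, e.g. a
  // v32i8 shift on a target whose widest legal vector is v16i8 splits into
  // two ops, so the per-op table cost is multiplied by LT.first == 2.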
372
373 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
374 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
375 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
376 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
377
378 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
379 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
380 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
381
382 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
383 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
384 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
385 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
386 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
387 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
388
389 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
390 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
391 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
392 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
393 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
394 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
395 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
396
397 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
398 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
399 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
400 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
401 };
402
403 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
404 if (const auto *Entry =
405 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
406 if (auto KindCost = Entry->Cost[CostKind])
407 return LT.first * *KindCost;
408
409 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
410 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
411 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
412 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
413 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
414 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
415 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
416
417 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
418 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
419 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
420 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
421 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
422 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
423
424 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
425 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
426 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
427 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
428 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
429 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
430
431 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
432 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
433 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
434 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
435 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
436 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
437
438 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
439 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
440 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
441 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
442 };
443
444 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
445 if (const auto *Entry =
446 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
447 if (auto KindCost = Entry->Cost[CostKind])
448 return LT.first * *KindCost;
449
450 static const CostKindTblEntry AVXUniformConstCostTable[] = {
451 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
452 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
453 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
454 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
455 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
456 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
457
458 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
459 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
460 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
461 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
462 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
463 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
464
465 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
466 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
467 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
468 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
469 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
470 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
471
472 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
473 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
474 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
475 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
476 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
477 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
478
479 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
480 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
481 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
482 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
483 };
484
485 // XOP has faster vXi8 shifts.
486 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
487 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
488 if (const auto *Entry =
489 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
490 if (auto KindCost = Entry->Cost[CostKind])
491 return LT.first * *KindCost;
492
493 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
494 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
495 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
496 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
497
498 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
499 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
500 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
501
502 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
503 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
504 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
505
506 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
507 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
508 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
509
510 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
511 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
512 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
513 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
514 };
515
516 // XOP has faster vXi8 shifts.
517 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
518 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
519 if (const auto *Entry =
520 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
521 if (auto KindCost = Entry->Cost[CostKind])
522 return LT.first * *KindCost;
523
524 static const CostKindTblEntry AVX512BWConstCostTable[] = {
525 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
526 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
527 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
528 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
529
530 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
531 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
532 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
533 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
534 };
535
536 if (Op2Info.isConstant() && ST->hasBWI())
537 if (const auto *Entry =
538 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
539 if (auto KindCost = Entry->Cost[CostKind])
540 return LT.first * *KindCost;
541
542 static const CostKindTblEntry AVX512ConstCostTable[] = {
543 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
544 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
545 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
546 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
547
548 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
549 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
550 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
551 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
552
553 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
554 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
555 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
556 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
557 };
558
559 if (Op2Info.isConstant() && ST->hasAVX512())
560 if (const auto *Entry =
561 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
562 if (auto KindCost = Entry->Cost[CostKind])
563 return LT.first * *KindCost;
564
565 static const CostKindTblEntry AVX2ConstCostTable[] = {
566 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
567 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
568 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
569 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
570
571 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
572 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
573 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
574 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
575
576 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
577 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
578 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
579 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
580 };
581
582 if (Op2Info.isConstant() && ST->hasAVX2())
583 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
584 if (auto KindCost = Entry->Cost[CostKind])
585 return LT.first * *KindCost;
586
587 static const CostKindTblEntry AVXConstCostTable[] = {
588 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
589 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
590 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
591 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
592
593 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
594 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
595 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
596 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
597
598 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
599 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
600 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
601 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
602 };
603
604 if (Op2Info.isConstant() && ST->hasAVX())
605 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
606 if (auto KindCost = Entry->Cost[CostKind])
607 return LT.first * *KindCost;
608
609 static const CostKindTblEntry SSE41ConstCostTable[] = {
610 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
611 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
612 };
613
614 if (Op2Info.isConstant() && ST->hasSSE41())
615 if (const auto *Entry =
616 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
617 if (auto KindCost = Entry->Cost[CostKind])
618 return LT.first * *KindCost;
619
620 static const CostKindTblEntry SSE2ConstCostTable[] = {
621 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
622 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
624 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
627 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
628 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
629 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
630
631 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
632 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
633 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
634 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
635 };
636
637 if (Op2Info.isConstant() && ST->hasSSE2())
638 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
639 if (auto KindCost = Entry->Cost[CostKind])
640 return LT.first * *KindCost;
641
642 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
643 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
644 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
645 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
646 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
647 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
648 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
649 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
650 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
651 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
652
653 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
654 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
655 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
656 };
657
658 if (ST->hasBWI() && Op2Info.isUniform())
659 if (const auto *Entry =
660 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
661 if (auto KindCost = Entry->Cost[CostKind])
662 return LT.first * *KindCost;
663
664 static const CostKindTblEntry AVX512UniformCostTable[] = {
665 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
666 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
667 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
668
669 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
670 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
671 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
672
673 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
674 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
675 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
676 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
677 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
678 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
679 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
680 };
681
682 if (ST->hasAVX512() && Op2Info.isUniform())
683 if (const auto *Entry =
684 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
685 if (auto KindCost = Entry->Cost[CostKind])
686 return LT.first * *KindCost;
687
688 static const CostKindTblEntry AVX2UniformCostTable[] = {
689 // Uniform splats are cheaper for the following instructions.
690 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
691 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
692 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
693 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
694 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
696
697 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
698 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
699 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
700 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
701 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
702 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
703
704 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
705 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
706 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
707 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
708 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
709 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
710
711 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
712 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
713 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
714 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
715 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
716 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
717 };
718
719 if (ST->hasAVX2() && Op2Info.isUniform())
720 if (const auto *Entry =
721 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
722 if (auto KindCost = Entry->Cost[CostKind])
723 return LT.first * *KindCost;
724
725 static const CostKindTblEntry AVXUniformCostTable[] = {
726 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
727 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
728 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
729 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
730 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
731 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
732
733 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
734 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
735 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
736 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
737 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
738 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
739
740 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
741 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
742 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
743 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
744 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
745 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
746
747 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
748 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
749 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
750 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
751 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
752 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
753 };
754
755 // XOP has faster vXi8 shifts.
756 if (ST->hasAVX() && Op2Info.isUniform() &&
757 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
758 if (const auto *Entry =
759 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
760 if (auto KindCost = Entry->Cost[CostKind])
761 return LT.first * *KindCost;
762
763 static const CostKindTblEntry SSE2UniformCostTable[] = {
764 // Uniform splats are cheaper for the following instructions.
765 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
766 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
767 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
768
769 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
770 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
771 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
772
773 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
774 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
775 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
776
777 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
778 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
779 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
780 };
781
782 if (ST->hasSSE2() && Op2Info.isUniform() &&
783 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
784 if (const auto *Entry =
785 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
786 if (auto KindCost = Entry->Cost[CostKind])
787 return LT.first * *KindCost;
788
789 static const CostKindTblEntry AVX512DQCostTable[] = {
790 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
791 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
792 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
793 };
794
795 // Look for AVX512DQ lowering tricks for custom cases.
796 if (ST->hasDQI())
797 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
798 if (auto KindCost = Entry->Cost[CostKind])
799 return LT.first * *KindCost;
800
801 static const CostKindTblEntry AVX512BWCostTable[] = {
802 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
803 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
804 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
805 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
806 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
807 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
808 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
809 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
810 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
811
812 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
813 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
814 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
815 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
816 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
817 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
818 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
819 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
820 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
821
822 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
823 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
824
825 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
826 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
827 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
828 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
829
830 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
831 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
832
833 { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
834 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
835
836 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
837 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
838 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
839 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
840 };
841
842 // Look for AVX512BW lowering tricks for custom cases.
843 if (ST->hasBWI())
844 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
845 if (auto KindCost = Entry->Cost[CostKind])
846 return LT.first * *KindCost;
847
848 static const CostKindTblEntry AVX512CostTable[] = {
849 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
850 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
851 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
852
853 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
854 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
855 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
856
857 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
858 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
859 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
860 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
861 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
862 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
863 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
864 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
865 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
866
867 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
868 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
869 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
870 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
871 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
872 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
873 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
874 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
875 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
876
877 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
878 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
879
880 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
881 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
882
883 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
884 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
885 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
886 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
887
888 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
889 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
890 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
891 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
892
893 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
894 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
895 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
896 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
897
898 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
899 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
900 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
901 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
902 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
903
904 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
905
906 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
907 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
908 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
909 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
910 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
911 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
912 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
913 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
914 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
915
916 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
917 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
918 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
919 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
920
921 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
922 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
923 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
924 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
925 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
926 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
927 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
928 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
929 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
930
931 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
932 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
933 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
934 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
935 };
936
937 if (ST->hasAVX512())
938 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
939 if (auto KindCost = Entry->Cost[CostKind])
940 return LT.first * *KindCost;
941
942 static const CostKindTblEntry AVX2ShiftCostTable[] = {
943 // Shifts on vXi64/vXi32 are legal on AVX2, even though we declare them as
944 // custom in order to detect the cases where the shift amount is a scalar.
945 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
946 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
947 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
948 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
949 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
950 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
951 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
952 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
953 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
954 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
955 };
956
957 if (ST->hasAVX512()) {
958 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
959 // On AVX512, a packed v32i16 shift left by a constant build_vector
960 // is lowered into a vector multiply (vpmullw).
961 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
962 Op1Info.getNoProps(), Op2Info.getNoProps());
963 }
964
965 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
966 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
967 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
968 Op2Info.isConstant())
969 // On AVX2, a packed v16i16 shift left by a constant build_vector
970 // is lowered into a vector multiply (vpmullw).
971 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
972 Op1Info.getNoProps(), Op2Info.getNoProps());
973
974 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
975 if (auto KindCost = Entry->Cost[CostKind])
976 return LT.first * *KindCost;
977 }
978
979 static const CostKindTblEntry XOPShiftCostTable[] = {
980 // 128bit shifts take 1cy, but right shifts require negation beforehand.
981 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
982 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
983 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
984 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
985 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
986 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
987 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
988 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
989 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
990 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
991 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
992 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
993 // 256bit shifts require splitting if AVX2 didn't catch them above.
994 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
995 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
996 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
997 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
998 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
999 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1000 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1001 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1002 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1003 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1004 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1005 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1006 };
1007
1008 // Look for XOP lowering tricks.
1009 if (ST->hasXOP()) {
1010 // If the right shift is constant then we'll fold the negation so
1011 // it's as cheap as a left shift.
1012 int ShiftISD = ISD;
1013 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1014 ShiftISD = ISD::SHL;
1015 if (const auto *Entry =
1016 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1017 if (auto KindCost = Entry->Cost[CostKind])
1018 return LT.first * *KindCost;
1019 }
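  // E.g. with XOP a vector srl by a constant is costed from the SHL rows
  // above, since the required negation of the shift amount is folded into
  // the constant.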
1020
1021 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1022 MVT VT = LT.second;
1023 // A vector shift left by a non-uniform constant can be lowered
1024 // into a vector multiply.
1025 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1026 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1027 ISD = ISD::MUL;
1028 }
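  // E.g. a v8i16 shl by the constant <1, 2, 3, ...> is costed as a v8i16
  // mul by <2, 4, 8, ...> from this point on.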
1029
1030 static const CostKindTblEntry GLMCostTable[] = {
1031 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1032 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1033 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1034 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1035 };
1036
1037 if (ST->useGLMDivSqrtCosts())
1038 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1039 if (auto KindCost = Entry->Cost[CostKind])
1040 return LT.first * *KindCost;
1041
1042 static const CostKindTblEntry SLMCostTable[] = {
1043 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1044 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1045 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1046 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1047 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1048 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1049 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1050 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1051 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1052 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1053 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1054 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1055 // v2i64/v4i64 mul is custom lowered as a series of long
1056 // multiplies(3), shifts(3) and adds(2).
1057 // SLM muldq throughput is 2, shift throughput is 1 and addq throughput is 4,
1058 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1059 // 2X4 (addq throughput) = 17
1060 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1061 // slm addq\subq throughput is 4
1062 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1063 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1064 };
1065
1066 if (ST->useSLMArithCosts())
1067 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1068 if (auto KindCost = Entry->Cost[CostKind])
1069 return LT.first * *KindCost;
1070
1071 static const CostKindTblEntry AVX2CostTable[] = {
1072 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1073 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1074 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
1075 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1076
1077 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1078 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1079 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1080 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1081
1082 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1083 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1084 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1085 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1086 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1087 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1088
1089 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1090 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1091 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1092 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1093 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1094 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1095 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1096 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1097
1098 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1099 { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
1100 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1101 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1102 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1103 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1104 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1105
1106 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1107
1108 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1109 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1110
1111 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1112 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1113 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1114 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1115 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1116 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1117
1118 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1119 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1120 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1121 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1122 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1123 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1124
1125 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1126 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1127 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1128 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1129 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1130 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1131
1132 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1133 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1134 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1135 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1136 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1137 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1138 };
1139
1140 // Look for AVX2 lowering tricks for custom cases.
1141 if (ST->hasAVX2())
1142 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1143 if (auto KindCost = Entry->Cost[CostKind])
1144 return LT.first * *KindCost;
1145
1146 static const CostKindTblEntry AVX1CostTable[] = {
1147 // We don't have to scalarize unsupported ops. We can issue two half-sized
1148 // operations and we only need to extract the upper YMM half.
1149 // Two ops + 1 extract + 1 insert = 4.
1150 { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
1151 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1152 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1153 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1154 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1155
1156 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1157 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1158 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1159 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1160
1161 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1162 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1163 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1164 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1165
1166 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1167 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1168 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1169 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1170
1171 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1172 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1173 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1174 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1175 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1176 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1177 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1178 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1179 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1180 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1181
1182 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1183 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1184 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1185 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1186 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1187 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1188 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1189 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1190
1191 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1192 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1193 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1194 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1195 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1196 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1197 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1198 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1199
1200 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1201 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1202 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1203 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1204 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1205 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1206 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1207 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1208
1209 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1210 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1211
1212 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1213 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1214 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1215 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1216 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1217 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1218
1219 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1220 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1221 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1222 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1223 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1224 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1225
1226 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1227 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1228 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1229 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1230 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1231 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1232
1233 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1234 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1235 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1236 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1237 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1238 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1239 };
1240
1241 if (ST->hasAVX())
1242 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1243 if (auto KindCost = Entry->Cost[CostKind])
1244 return LT.first * *KindCost;
1245
1246 static const CostKindTblEntry SSE42CostTable[] = {
1247 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1248 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1249 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1250 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1251
1252 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1253 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1254 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1255 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1256
1257 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1258 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1259 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1260 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1261
1262 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1263 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1264 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1265 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1266
1267 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1268 };
1269
1270 if (ST->hasSSE42())
1271 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1272 if (auto KindCost = Entry->Cost[CostKind])
1273 return LT.first * *KindCost;
1274
1275 static const CostKindTblEntry SSE41CostTable[] = {
1276 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1277 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1278 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1279
1280 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1281 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1282 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1283 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1284
1285 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1286 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1287 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1288 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1289
1290 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1291 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1292 };
1293
1294 if (ST->hasSSE41())
1295 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1296 if (auto KindCost = Entry->Cost[CostKind])
1297 return LT.first * *KindCost;
1298
1299 static const CostKindTblEntry SSE2CostTable[] = {
1300 // We don't correctly identify costs of casts because they are marked as
1301 // custom.
1302 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1303 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1304 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1305 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1306
1307 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1308 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1309 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1310 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1311
1312 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1313 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1314 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1315 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1316
1317 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1318 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1319 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1320 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1321
1322 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1323 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1324 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1325 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1326
1327 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1328 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1329 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1330 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1331
1332 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1333 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1334
1335 { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1336 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1337 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1338 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1339
1340 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1341
1342 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1343 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1344 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1345 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1346
1347 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1348 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1349 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1350 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1351
1352 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1353 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1354 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1355
1356 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1357 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1358 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1359
1360 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1361 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362 };
1363
1364 if (ST->hasSSE2())
1365 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1366 if (auto KindCost = Entry->Cost[CostKind])
1367 return LT.first * *KindCost;
1368
1369 static const CostKindTblEntry SSE1CostTable[] = {
1370 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1371 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1372
1373 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1374 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1375
1376 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1377 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1378
1379 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1380 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1381
1382 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1383 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1384 };
1385
1386 if (ST->hasSSE1())
1387 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1388 if (auto KindCost = Entry->Cost[CostKind])
1389 return LT.first * *KindCost;
1390
1391 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1392 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1393 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1394 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1395 };
1396
1397 if (ST->is64Bit())
1398 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1399 if (auto KindCost = Entry->Cost[CostKind])
1400 return LT.first * *KindCost;
1401
1402 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1403 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1404 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1405 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1406
1407 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1408 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1409 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1410
1411 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1412 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1413 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1414
1415 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1417 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1418 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1419 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1420 };
1421
1422 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1423 if (auto KindCost = Entry->Cost[CostKind])
1424 return LT.first * *KindCost;
1425
1426 // It is not a good idea to vectorize division. We have to scalarize it and
1427   // in the process we will often end up having to spill regular
1428 // registers. The overhead of division is going to dominate most kernels
1429 // anyways so try hard to prevent vectorization of division - it is
1430 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1431 // to hide "20 cycles" for each lane.
1432 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1433 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1434 ISD == ISD::UREM)) {
1435     InstructionCost ScalarCost =
1436         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1437                                Op1Info.getNoProps(), Op2Info.getNoProps());
1438 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1439 }
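  // Illustrative example (assuming a legal <4 x i32> sdiv reaches this
  // point): the returned cost is 20 * LT.first * 4 * ScalarCost, i.e. at
  // least 80x the scalar division cost per legalized register, which is
  // what keeps the vectorizers away from vector division.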
1440
1441 // Handle some basic single instruction code size cases.
1442 if (CostKind == TTI::TCK_CodeSize) {
1443 switch (ISD) {
1444 case ISD::FADD:
1445 case ISD::FSUB:
1446 case ISD::FMUL:
1447 case ISD::FDIV:
1448 case ISD::FNEG:
1449 case ISD::AND:
1450 case ISD::OR:
1451 case ISD::XOR:
1452 return LT.first;
1453 break;
1454 }
1455 }
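  // Illustrative example: a <4 x float> fadd whose type is already legal
  // falls into the switch above and returns LT.first == 1, a single
  // instruction's worth of code size.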
1456
1457 // Fallback to the default implementation.
1458 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1459 Args, CxtI);
1460}
1461
1462 InstructionCost
1463 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1464                             unsigned Opcode1, const SmallBitVector &OpcodeMask,
1465                             TTI::TargetCostKind CostKind) const {
1466   if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1467     return TTI::TCC_Basic;
1468   return InstructionCost::getInvalid();
1469 }
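// Illustrative note (an assumption about the intended use, not spelled out
// here): an alternating fadd/fsub pair over <4 x float> whose OpcodeMask
// alternates per lane is the classic addsub pattern; when isLegalAltInstr()
// accepts it, the whole pair is costed as one basic instruction above.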
1470
1471 InstructionCost X86TTIImpl::getShuffleCost(
1472     TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1473     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1474     ArrayRef<const Value *> Args, const Instruction *CxtI) {
1475 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1476 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1477 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1478
1479 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1480
1481 // Recognize a basic concat_vector shuffle.
1482 if (Kind == TTI::SK_PermuteTwoSrc &&
1483 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1484 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1485     return getShuffleCost(TTI::SK_InsertSubvector,
1486                           VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1487                           CostKind, Mask.size() / 2, BaseTp);
1488
1489 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1490 if (Kind == TTI::SK_Transpose)
1491 Kind = TTI::SK_PermuteTwoSrc;
1492
1493 if (Kind == TTI::SK_Broadcast) {
1494 // For Broadcasts we are splatting the first element from the first input
1495 // register, so only need to reference that input and all the output
1496 // registers are the same.
1497 LT.first = 1;
1498
1499 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1500 using namespace PatternMatch;
1501 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1502 (ST->hasAVX2() ||
1503 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1504 return TTI::TCC_Free;
1505 }
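  // Illustrative example: splatting a one-use scalar load with AVX2 lowers
  // to a single broadcast with the load folded (e.g. vpbroadcastd (mem)),
  // hence the TCC_Free above.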
1506
1507 // Treat <X x bfloat> shuffles as <X x half>.
1508 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1509 LT.second = LT.second.changeVectorElementType(MVT::f16);
1510
1511 // Subvector extractions are free if they start at the beginning of a
1512 // vector and cheap if the subvectors are aligned.
1513 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1514 int NumElts = LT.second.getVectorNumElements();
1515 if ((Index % NumElts) == 0)
1516 return 0;
1517 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1518 if (SubLT.second.isVector()) {
1519 int NumSubElts = SubLT.second.getVectorNumElements();
1520 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1521 return SubLT.first;
1522 // Handle some cases for widening legalization. For now we only handle
1523 // cases where the original subvector was naturally aligned and evenly
1524 // fit in its legalized subvector type.
1525 // FIXME: Remove some of the alignment restrictions.
1526 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1527 // vectors.
1528 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1529 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1530 (NumSubElts % OrigSubElts) == 0 &&
1531 LT.second.getVectorElementType() ==
1532 SubLT.second.getVectorElementType() &&
1533 LT.second.getVectorElementType().getSizeInBits() ==
1534               BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1535         assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1536 "Unexpected number of elements!");
1537 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1538 LT.second.getVectorNumElements());
1539 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1540 SubLT.second.getVectorNumElements());
1541 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1542 InstructionCost ExtractCost =
1543 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1544 CostKind, ExtractIndex, SubTy);
1545
1546 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1547 // if we have SSSE3 we can use pshufb.
1548 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1549 return ExtractCost + 1; // pshufd or pshufb
1550
1551 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1552 "Unexpected vector size");
1553
1554 return ExtractCost + 2; // worst case pshufhw + pshufd
1555 }
1556 }
1557 // If the extract subvector is not optimal, treat it as single op shuffle.
1558     Kind = TTI::SK_PermuteSingleSrc;
1559   }
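  // Illustrative example: extracting the upper <4 x i32> half of a legalized
  // <8 x i32> (Index == 4) is subvector-aligned and costs SubLT.first, while
  // an extract starting at element 0 is free.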
1560
1561 // Subvector insertions are cheap if the subvectors are aligned.
1562 // Note that in general, the insertion starting at the beginning of a vector
1563 // isn't free, because we need to preserve the rest of the wide vector.
1564 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1565 int NumElts = LT.second.getVectorNumElements();
1566 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1567 if (SubLT.second.isVector()) {
1568 int NumSubElts = SubLT.second.getVectorNumElements();
1569 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1570 return SubLT.first;
1571 }
1572
1573 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1574 Kind = TTI::SK_PermuteTwoSrc;
1575 }
1576
1577 // Handle some common (illegal) sub-vector types as they are often very cheap
1578 // to shuffle even on targets without PSHUFB.
1579 EVT VT = TLI->getValueType(DL, BaseTp);
1580 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1581 !ST->hasSSSE3()) {
1582 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1583 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1584 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1585 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1586 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1587 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1588
1589 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1590 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1591 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1592 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1593
1594 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1595 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1596 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1597 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1598
1599 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1600 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1601 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1602 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1603 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1604
1605 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1606 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1607 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1608 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1609 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1610 };
1611
1612 if (ST->hasSSE2())
1613 if (const auto *Entry =
1614 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1615 return Entry->Cost;
1616 }
1617
1618 // We are going to permute multiple sources and the result will be in multiple
1619   // destinations. Provide an accurate cost only for splits where the element
1620 // type remains the same.
1621 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1622 MVT LegalVT = LT.second;
1623 if (LegalVT.isVector() &&
1624 LegalVT.getVectorElementType().getSizeInBits() ==
1625             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1626         LegalVT.getVectorNumElements() <
1627 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1628 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1629 unsigned LegalVTSize = LegalVT.getStoreSize();
1630 // Number of source vectors after legalization:
1631 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1632 // Number of destination vectors after legalization:
1633 InstructionCost NumOfDests = LT.first;
1634
1635 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1636 LegalVT.getVectorNumElements());
1637
1638 if (!Mask.empty() && NumOfDests.isValid()) {
1639 // Try to perform better estimation of the permutation.
1640 // 1. Split the source/destination vectors into real registers.
1641 // 2. Do the mask analysis to identify which real registers are
1642       // permuted. If more than one source register is used for the
1643 // destination register building, the cost for this destination register
1644 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1645 // source register is used, build mask and calculate the cost as a cost
1646 // of PermuteSingleSrc.
1647 // Also, for the single register permute we try to identify if the
1648 // destination register is just a copy of the source register or the
1649 // copy of the previous destination register (the cost is
1650 // TTI::TCC_Basic). If the source register is just reused, the cost for
1651 // this operation is 0.
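        // Illustrative example of that estimation: with two legal source
        // registers and two destination registers, a destination built from
        // lanes of both sources is charged one PermuteTwoSrc, one built from
        // a single source with a non-identity mask one PermuteSingleSrc, and
        // a plain copy of a source register adds nothing.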
1652 NumOfDests =
1653             getTypeLegalizationCost(
1654                 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1655 .first;
1656 unsigned E = *NumOfDests.getValue();
1657 unsigned NormalizedVF =
1658 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1659 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1660 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1661 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1662 copy(Mask, NormalizedMask.begin());
1663 unsigned PrevSrcReg = 0;
1664 ArrayRef<int> PrevRegMask;
1665       InstructionCost Cost = 0;
1666       processShuffleMasks(
1667           NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1668 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1669 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1670 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1671 // Check if the previous register can be just copied to the next
1672 // one.
1673 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1674 PrevRegMask != RegMask)
1675                 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1676                                        RegMask, CostKind, 0, nullptr);
1677               else
1678                 // Just a copy of previous destination register.
1679                 Cost += TTI::TCC_Basic;
1680               return;
1681 }
1682 if (SrcReg != DestReg &&
1683 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1684 // Just a copy of the source register.
1685               Cost += TTI::TCC_Basic;
1686             }
1687 PrevSrcReg = SrcReg;
1688 PrevRegMask = RegMask;
1689 },
1690 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1691 unsigned /*Unused*/,
1692 unsigned /*Unused*/) {
1693 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1694 CostKind, 0, nullptr);
1695 });
1696 return Cost;
1697 }
1698
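    // Illustrative example of the fallback below (when no usable mask is
    // available): a single-source permute of <16 x float> legalized to two
    // <8 x float> registers gives NumOfSrcs == 2 and NumOfDests == 2, so the
    // estimate is (2 - 1) * 2 = 2 two-source shuffles.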
1699 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1700 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1701 std::nullopt, CostKind, 0, nullptr);
1702 }
1703
1704 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1705 }
1706
1707 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1708 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1709 // We assume that source and destination have the same vector type.
1710 InstructionCost NumOfDests = LT.first;
1711 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1712 LT.first = NumOfDests * NumOfShufflesPerDest;
1713 }
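  // Illustrative example: a two-source shuffle of <16 x i32> on AVX2
  // legalizes to LT.first == 2, so it is modelled as 2 destinations times
  // (2 * 2 - 1) = 3 shuffles each, i.e. LT.first becomes 6.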
1714
1715 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1716 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1717 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1718
1719 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1720 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1721
1722 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1723 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1724 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1725 };
1726
1727 if (ST->hasVBMI())
1728 if (const auto *Entry =
1729 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1730 return LT.first * Entry->Cost;
1731
1732 static const CostTblEntry AVX512BWShuffleTbl[] = {
1733 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1734 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1735 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1736
1737 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1738 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1739 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1740 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1741
1742 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1743 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1744 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1745 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1746 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1747
1748 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1749 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1750 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1751 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1752 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1753
1754 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1755 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1756
1757 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1758 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1759 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1760 };
1761
1762 if (ST->hasBWI())
1763 if (const auto *Entry =
1764 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1765 return LT.first * Entry->Cost;
1766
1767 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1768 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1769 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1770 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1771 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1772 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1773 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1774 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1775
1776 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1777 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1778 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1779 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1780 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1781 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1782 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1783
1784 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1785 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1786 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1787 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1788 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1789 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1790 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1791 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1792 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1793 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1794 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1795
1796 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1797 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1798 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1799 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1800 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1801 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1802 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1803 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1804 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1805 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1806 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1807 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1808 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1809
1810 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1811 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1812 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1813 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1814 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1815 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1816 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1817 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1818 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1819 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1820 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1821 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1822
1823 // FIXME: This just applies the type legalization cost rules above
1824 // assuming these completely split.
1825 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1826 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1827 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1828 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1829 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1830 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1831
1832 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1833 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1834 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1835 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1836 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1837 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1838 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1839 };
1840
1841 if (ST->hasAVX512())
1842 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1843 if (auto KindCost = Entry->Cost[CostKind])
1844 return LT.first * *KindCost;
1845
1846 static const CostTblEntry AVX2ShuffleTbl[] = {
1847 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1848 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1849 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1850 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1851 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1852 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1853 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1854
1855 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1856 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1857 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1858 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1859 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1860 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1861 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1862
1863 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1864 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1865 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1866
1867 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1868 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1869 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1870 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1871 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1872
1873 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1874 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1875 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1876 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1877 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1878 // + vpblendvb
1879 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1880 // + vpblendvb
1881 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1882 // + vpblendvb
1883
1884 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1885 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1886 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1887 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1888 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1889 // + vpblendvb
1890 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1891 // + vpblendvb
1892 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1893 // + vpblendvb
1894 };
1895
1896 if (ST->hasAVX2())
1897 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1898 return LT.first * Entry->Cost;
1899
1900 static const CostTblEntry XOPShuffleTbl[] = {
1901 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1902 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1903 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1904 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1905 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1906 // + vinsertf128
1907 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1908 // + vinsertf128
1909
1910 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1911 // + vinsertf128
1912 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1913 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1914 // + vinsertf128
1915 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1916 };
1917
1918 if (ST->hasXOP())
1919 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1920 return LT.first * Entry->Cost;
1921
1922 static const CostTblEntry AVX1ShuffleTbl[] = {
1923 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1924 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1925 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1926 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1927 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1928 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1929 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1930
1931 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1932 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1933 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1934 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1935 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1936 // + vinsertf128
1937 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1938 // + vinsertf128
1939 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1940 // + vinsertf128
1941
1942 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1943 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1944 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1945 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1946 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1947 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1948 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1949
1950 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1951 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1952 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1953 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1954 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1955 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1956 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1957
1958 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1959 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1960 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1961 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1962 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1963 // + 2*por + vinsertf128
1964 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1965 // + 2*por + vinsertf128
1966 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1967 // + 2*por + vinsertf128
1968
1969 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1970 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1971 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1972 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1973 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1974 // + 4*por + vinsertf128
1975 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1976 // + 4*por + vinsertf128
1977 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1978 // + 4*por + vinsertf128
1979 };
1980
1981 if (ST->hasAVX())
1982 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1983 return LT.first * Entry->Cost;
1984
1985 static const CostTblEntry SSE41ShuffleTbl[] = {
1986 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1987 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1988 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1989 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1990 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1991 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1992 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1993 };
1994
1995 if (ST->hasSSE41())
1996 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1997 return LT.first * Entry->Cost;
1998
1999 static const CostTblEntry SSSE3ShuffleTbl[] = {
2000 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2001 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2002 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2003
2004 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2005 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2006 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2007
2008 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2009 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2010 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2011
2012 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2013 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2014 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2015 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2016 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2017
2018 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2019 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2020 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2021
2022 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2023 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2024 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2025 };
2026
2027 if (ST->hasSSSE3())
2028 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2029 return LT.first * Entry->Cost;
2030
2031 static const CostTblEntry SSE2ShuffleTbl[] = {
2032 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2033 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2034 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2035 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2036 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2037 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2038
2039 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2040 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2041 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2042 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2043 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2044 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2045 // + 2*pshufd + 2*unpck + packus
2046
2047 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2048 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2049 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2050 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2051 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2052 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2053
2054 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2055 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2056 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2057 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por
2058 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por
2059 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por
2060
2061 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2062 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2063 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2064 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2065 // + pshufd/unpck
2066 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2067 // + pshufd/unpck
2068 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2069 // + 2*pshufd + 2*unpck + 2*packus
2070
2071 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2072 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2073 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2074 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2075 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2076 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2077 };
2078
2079 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2080 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2081 };
2082
2083 if (ST->hasSSE2()) {
2084 bool IsLoad =
2085 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2086 if (ST->hasSSE3() && IsLoad)
2087 if (const auto *Entry =
2088 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2089         assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2090                                     LT.second.getVectorElementCount()) &&
2091 "Table entry missing from isLegalBroadcastLoad()");
2092 return LT.first * Entry->Cost;
2093 }
2094
2095 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2096 return LT.first * Entry->Cost;
2097 }
2098
2099 static const CostTblEntry SSE1ShuffleTbl[] = {
2100 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2101 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2102 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2103 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2104 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2105 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2106 };
2107
2108 if (ST->hasSSE1())
2109 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2110 return LT.first * Entry->Cost;
2111
2112 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2113}
2114
2115 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2116                                              Type *Src,
2117                                              TTI::CastContextHint CCH,
2118                                              TTI::TargetCostKind CostKind,
2119                                              const Instruction *I) {
2120 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2121 assert(ISD && "Invalid opcode");
2122
2123 // TODO: Allow non-throughput costs that aren't binary.
2124   auto AdjustCost = [&CostKind](InstructionCost Cost,
2125                                 InstructionCost N = 1) {
2126     if (CostKind != TTI::TCK_RecipThroughput)
2127       return Cost == 0 ? 0 : N;
2128     return Cost * N;
2129   };
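  // Illustrative behaviour of AdjustCost: at TCK_RecipThroughput a table cost
  // is simply scaled by the split factor N; for any other cost kind a zero
  // table cost stays free and every non-zero cost collapses to N.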
2130
2131 // The cost tables include both specific, custom (non-legal) src/dst type
2132 // conversions and generic, legalized types. We test for customs first, before
2133 // falling back to legalization.
2134 // FIXME: Need a better design of the cost table to handle non-simple types of
2135 // potential massive combinations (elem_num x src_type x dst_type).
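  // Illustrative example: a sign extension from <32 x i8> to <32 x i16> hits
  // the AVX512BW entry below directly (cost 1) before any legalized-type
  // lookup is attempted.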
2136 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2137 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2138 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2139
2140 // Mask sign extend has an instruction.
2141 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2142 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2143 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2144 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2145 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2146 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2147 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2148 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2149 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2150 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2151 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2152 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2153 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2154 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2155 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
2156 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
2157 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
2158
2159 // Mask zero extend is a sext + shift.
2160 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2161 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2162 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2163 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2164 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2165 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2166 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2167 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2168 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2169 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2170 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2171 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2172 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2173 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2174 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
2175 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
2176 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
2177
2178 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2179 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2180 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2181 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2182 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2183 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2184 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2185 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2186 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2187 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2188 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2189 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2190 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2191 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2192 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
2193 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
2194 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
2195
2196 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
2197 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
2198 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
2199 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
2200 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
2201 };
2202
2203 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2204 // Mask sign extend has an instruction.
2205 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2206 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2207 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2208 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2209 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2210 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
2211 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
2212 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
2213
2214 // Mask zero extend is a sext + shift.
2215 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2216 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2217 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2218 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2219 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2220 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
2221 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
2222 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
2223
2224 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2225 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2226 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2227 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2228 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2229 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
2230 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
2231 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
2232
2233 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2234 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2235
2236 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2237 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2238
2239 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
2240 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
2241
2242 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
2243 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
2244 };
2245
2246 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2247 // 256-bit wide vectors.
2248
2249 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2250 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
2251 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
2252 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4
2253 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
2254
2255 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2256 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2257 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2258 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
2259 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2260 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2261 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2262 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2263 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
2264 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
2265 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
2266 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
2267 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
2268 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
2269 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
2270 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
2271 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
2272 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
2273 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
2274 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
2275 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
2276 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
2277 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
2278 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
2279 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
2280 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
2281 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
2282 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
2283 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
2284 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
2285 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
2286 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
2287 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
2288 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2289
2290 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
2291 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
2292 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
2293
2294 // Sign extend is zmm vpternlogd+vptruncdb.
2295 // Zero extend is zmm broadcast load+vptruncdw.
2296 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
2297 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
2298 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
2299 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
2300 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
2301 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
2302 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
2303 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
2304
2305 // Sign extend is zmm vpternlogd+vptruncdw.
2306 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2307 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
2308 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2309 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
2310 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2311 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
2312 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2313 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
2314 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2315
2316 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
2317 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
2318 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
2319 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
2320 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
2321 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
2322 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
2323 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
2324 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
2325 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
2326
2327 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
2328 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2329 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
2330 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
2331
2332 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2333 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2334 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2335 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2336 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2337 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2338 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2339 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2340 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2341 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2342
2343 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2344 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2345
2346 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2347 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2348 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2349 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2350 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2351 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2352 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2353 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2354
2355 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2356 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2357 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2358 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2359 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2360 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2361 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2362 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2363 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
2364 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
2365
2366 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2367 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
2368 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
2369 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
2370 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
2371 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
2372 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
2373 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
2374 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
2375 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
2376 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
2377
2378 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2379 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
2380 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
2381 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
2382 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
2383 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
2384 };
2385
2386 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2387 // Mask sign extend has an instruction.
2388 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2389 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2390 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2391 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2392 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2393 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2394 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2395 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2396 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2397 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2398 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2399 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2400 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2401 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2402 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
2403 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
2404 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
2405
2406 // Mask zero extend is a sext + shift.
2407 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2408 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2409 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2410 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2411 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2412 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2413 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2414 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2415 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2416 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2417 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2418 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2419 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2420 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2421 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
2422 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
2423 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
2424
2425 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2426 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2427 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2428 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2429 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2430 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2431 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2432 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2433 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2434 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2435 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2436 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2437 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2438 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2439 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
2440 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
2441 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
2442
2443 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
2444 };
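// Example (illustrative): with AVX512BW+VL a "zext <16 x i1> %m to <16 x i8>"
// follows the entries above: the mask is first materialised via the mask
// sign-extend instruction (vpmovm2b at this feature level) and a shift then
// reduces the all-ones lanes to 0/1, so it costs 2 where the plain sign
// extend costs 1.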
2445
2446 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
2447 // Mask sign extend has an instruction.
2448 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2449 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2450 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2451 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
2452 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2453 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
2454 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
2455 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2456
2457 // Mask zero extend is a sext + shift.
2458 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2459 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2460 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2461 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
2462 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2463 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
2464 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
2465 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2466
2467 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
2468 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
2469 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2470 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2471 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2472 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2473 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
2474 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2475
2476 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2477 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2478 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2479 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2480
2481 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2482 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2483 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2484 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2485
2486 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
2487 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
2488 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2489 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
2490
2491 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
2492 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
2493 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2494 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
2495 };
2496
2497 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2498 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2499 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2500 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2501 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2502 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2503 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2504 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2505 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2506 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2507 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2508 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2509 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2510 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2511 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2512 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2513 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2514 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2515 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2516
2517 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2518 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2519 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
2520 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
2521 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
2522 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
2523 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
2524 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
2525 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
2526 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
2527
2528 // sign extend is vpcmpeq+maskedmove+vpmovdw
2529 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2530 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2531 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
2532 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2533 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
2534 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2535 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
2536 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2537 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2538
2539 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2540 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2541 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2542 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2543 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2544 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2545 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
2546 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2547
2548 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2549 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2550 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2551 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2552
2553 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2554 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2555 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2556 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2557 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2558 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2559 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2560 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2561 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2562 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2563 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2564 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2565
2566 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2567 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2568 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2569 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2570
2571 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
2572 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
2573 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2574 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2575 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2576 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2577 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2578 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2579 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2580 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2581 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
2582 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2583 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
2584
2585 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2586 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2587 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
2588
2589 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
2590 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
2591 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2592 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
2593 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
2594 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
2595 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2596 };
2597
2598 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2599 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2600 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2601 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2602 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2603 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2604 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2605
2606 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2607 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2608 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2609 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2610 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2611 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2612 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2613 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2614 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2615 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2616 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2617 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2618 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2619 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2620
2621 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2622
2623 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
2624 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
2625 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
2626 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
2627 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
2628 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
2629 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
2630 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
2631 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
2632 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
2633 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
2634 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
2635
2636 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
2637 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
2638
2639 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
2640 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
2641 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
2642 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
2643
2644 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
2645 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
2646 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
2647 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2648 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2649 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
2650 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
2651 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
2652
2653 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2654 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2655 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2656 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2657 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2658 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2659 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
2660
2661 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2662 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2663 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2664 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2665 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
2666 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
2667 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
2668 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2669 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2670 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2671 };
2672
2673 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2674 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2675 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2676 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2677 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2678 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2679 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2680
2681 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2682 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2683 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2684 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2685 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2686 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2687 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2688 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2689 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2690 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2691 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2692 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2693
2694 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
2695 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
2696 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
2697 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
2698 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
2699
2700 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2701 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
2702 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2703 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
2704 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2705 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
2706 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2707 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
2708
2709 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
2710 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
2711 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
2712 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2713 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2714 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2715 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2716 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2717 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2718 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2719 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
2720 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
2721
2722 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
2723 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
2724 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
2725 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2726 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2727 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2728 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2729 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
2730 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
2731 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2732 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
2733 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
2734 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
2735 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
2736 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
2737 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2738 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
2739
2740 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2741 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
2742 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
2743 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
2744 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
2745 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
2746 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
2747 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
2748 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
2749 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
2750 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
2751
2752 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
2753 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
2754 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
2755 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
2756 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
2757 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
2758 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
2759 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
2760 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2761 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2762 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
2763 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
2764 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
2765
2766 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2767 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2768 };
2769
2770 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2771 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2772 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2773 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2774 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2775 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2776 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2777 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2778 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2779 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2780 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2781 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2782 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2783
2784 // These truncates end up widening elements.
2785 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2786 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2787 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2788
2789 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2790 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2791 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2792
2793 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2794 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2795 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2796 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2797 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2798 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2799 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2800 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2801 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2802 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2803 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2804
2805 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2806 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2807 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2808 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2809 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2810 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2811 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2812 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2813 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2814 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2815 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2816 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2817 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2818 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2819
2820 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2821 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2822 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2823 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2824 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2825 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2826 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2827 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2828 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2829 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2830
2831 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2832 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2833 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2834 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2835 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2836 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2837 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2838 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2839 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2840 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2841 };
2842
2843 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2844 // These are somewhat magic numbers justified by comparing the
2845 // output of llvm-mca for our various supported scheduler models
2846 // and basing it off the worst case scenario.
2847 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
2848 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
2849 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
2850 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
2851 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2852 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2853 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2854 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2855 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2856 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2857 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2858 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2859
2860 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
2861 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
2862 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2863 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2864 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2865 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2866 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2867 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2868 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
2869 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2870 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2871 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2872 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2873
2874 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
2875 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
2876 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
2877 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
2878 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
2879 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
2880 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
2881 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
2882 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
2883 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
2884
2885 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
2886 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2887 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
2888 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2889 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
2890 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
2891 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
2892 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
2893 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2894 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
2895
2896 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2897 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2898 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
2899 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
2900 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2901 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
2902 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
2903 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
2904 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2905 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
2906 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2907 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
2908
2909 // These truncates are really widening elements.
2910 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2911 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2912 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2913 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2914 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2915 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2916
2917 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2918 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2919 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2920 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2921 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2922 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
2923 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2924 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 },
2925 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2926 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2927 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2928 };
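// Example (illustrative): rows such as { ISD::UINT_TO_FP, MVT::v4f32,
// MVT::v4i32, 5 } above can be re-derived by feeding the legalized expansion
// to llvm-mca for each supported scheduler model and keeping the worst case,
// e.g. (uitofp_v4i32.s being a hypothetical file holding the lowered sequence):
//   llvm-mca -mtriple=x86_64-- -mcpu=atom uitofp_v4i32.s
//   llvm-mca -mtriple=x86_64-- -mcpu=slm uitofp_v4i32.s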
2929
2930 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2931 EVT SrcTy = TLI->getValueType(DL, Src);
2932 EVT DstTy = TLI->getValueType(DL, Dst);
2933
2934 // The function getSimpleVT only handles simple value types.
2935 if (SrcTy.isSimple() && DstTy.isSimple()) {
2936 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2937 MVT SimpleDstTy = DstTy.getSimpleVT();
2938
2939 if (ST->useAVX512Regs()) {
2940 if (ST->hasBWI())
2941 if (const auto *Entry = ConvertCostTableLookup(
2942 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2943 return AdjustCost(Entry->Cost);
2944
2945 if (ST->hasDQI())
2946 if (const auto *Entry = ConvertCostTableLookup(
2947 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2948 return AdjustCost(Entry->Cost);
2949
2950 if (ST->hasAVX512())
2951 if (const auto *Entry = ConvertCostTableLookup(
2952 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2953 return AdjustCost(Entry->Cost);
2954 }
2955
2956 if (ST->hasBWI())
2957 if (const auto *Entry = ConvertCostTableLookup(
2958 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2959 return AdjustCost(Entry->Cost);
2960
2961 if (ST->hasDQI())
2962 if (const auto *Entry = ConvertCostTableLookup(
2963 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2964 return AdjustCost(Entry->Cost);
2965
2966 if (ST->hasAVX512())
2967 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2968 SimpleDstTy, SimpleSrcTy))
2969 return AdjustCost(Entry->Cost);
2970
2971 if (ST->hasAVX2()) {
2972 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2973 SimpleDstTy, SimpleSrcTy))
2974 return AdjustCost(Entry->Cost);
2975 }
2976
2977 if (ST->hasAVX()) {
2978 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2979 SimpleDstTy, SimpleSrcTy))
2980 return AdjustCost(Entry->Cost);
2981 }
2982
2983 if (ST->hasSSE41()) {
2984 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2985 SimpleDstTy, SimpleSrcTy))
2986 return AdjustCost(Entry->Cost);
2987 }
2988
2989 if (ST->hasSSE2()) {
2990 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2991 SimpleDstTy, SimpleSrcTy))
2992 return AdjustCost(Entry->Cost);
2993 }
2994 }
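// Example (illustrative): with AVX2 but no AVX-512, a "sitofp <8 x i32> %x to
// <8 x float>" gives SimpleDstTy == MVT::v8f32 and SimpleSrcTy == MVT::v8i32,
// so the cascade above stops at the AVX2ConversionTbl entry
// { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 } and returns AdjustCost(1).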
2995
2996 // Fall back to legalized types.
2997 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2998 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2999
3000 // If we're truncating to the same legalized type - just assume it's free.
3001 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3002 return TTI::TCC_Free;
3003
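// Example (illustrative): a truncate between two illegal types whose elements
// promote to the same width, e.g. "trunc <4 x i30> %x to <4 x i25>", legalizes
// both sides to MVT::v4i32, so it is reported as free here.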
3004 if (ST->useAVX512Regs()) {
3005 if (ST->hasBWI())
3006 if (const auto *Entry = ConvertCostTableLookup(
3007 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3008 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3009
3010 if (ST->hasDQI())
3011 if (const auto *Entry = ConvertCostTableLookup(
3012 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3013 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3014
3015 if (ST->hasAVX512())
3016 if (const auto *Entry = ConvertCostTableLookup(
3017 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3018 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3019 }
3020
3021 if (ST->hasBWI())
3022 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3023 LTDest.second, LTSrc.second))
3024 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3025
3026 if (ST->hasDQI())
3027 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3028 LTDest.second, LTSrc.second))
3029 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3030
3031 if (ST->hasAVX512())
3032 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3033 LTDest.second, LTSrc.second))
3034 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3035
3036 if (ST->hasAVX2())
3037 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3038 LTDest.second, LTSrc.second))
3039 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3040
3041 if (ST->hasAVX())
3042 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3043 LTDest.second, LTSrc.second))
3044 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3045
3046 if (ST->hasSSE41())
3047 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3048 LTDest.second, LTSrc.second))
3049 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3050
3051 if (ST->hasSSE2())
3052 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3053 LTDest.second, LTSrc.second))
3054 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3055
3056 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3057 // sitofp.
3058 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3059 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3060 Type *ExtSrc = Src->getWithNewBitWidth(32);
3061 unsigned ExtOpc =
3062 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3063
3064 // For scalar loads the extend would be free.
3065 InstructionCost ExtCost = 0;
3066 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3067 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3068
3069 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3070 TTI::CastContextHint::None, CostKind);
3071 }
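// Example (illustrative): a scalar "sitofp i16 %x to float" has no table entry
// even after legalization, so it is costed just above as a sext i16 -> i32
// plus a sitofp i32 -> float, and the extend is treated as free when the i16
// value comes straight from a load.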
3072
3073 // Fallback for fptosi/fptoui i8/i16 cases: we need to truncate the result
3074 // of an i32 fptosi.
3075 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3076 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3077 Type *TruncDst = Dst->getWithNewBitWidth(32);
3078 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3079 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3080 TTI::CastContextHint::None, CostKind);
3081 }
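// Example (illustrative): a scalar "fptoui float %x to i16" is likewise costed
// as an fptosi float -> i32 followed by a trunc i32 -> i16, since no table
// provides a direct entry for the narrow destination.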
3082
3083 return AdjustCost(
3084 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3085}
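// Example (illustrative): passes normally reach the logic above through the
// TargetTransformInfo wrapper rather than X86TTIImpl directly, e.g.
//   InstructionCost C = TTI.getCastInstrCost(
//       Instruction::SExt, DstVecTy, SrcVecTy, TTI::CastContextHint::None,
//       TTI::TCK_RecipThroughput, &Cast);
// where TTI, DstVecTy, SrcVecTy and Cast are whatever the querying pass has
// at hand.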
3086
3087 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3088 Type *CondTy,
3089 CmpInst::Predicate VecPred,
3090 TTI::TargetCostKind CostKind,
3091 const Instruction *I) {
3092 // Early out if this type isn't scalar/vector integer/float.
3093 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3094 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3095 I);
3096
3097 // Legalize the type.
3098 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3099
3100 MVT MTy = LT.second;
3101
3102 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3103 assert(ISD && "Invalid opcode");
3104
3105 InstructionCost ExtraCost = 0;
3106 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3107 // Some vector comparison predicates cost extra instructions.
3108 // TODO: Adjust ExtraCost based on CostKind?
3109 // TODO: Should we invert this and assume worst case cmp costs
3110 // and reduce for particular predicates?
3111 if (MTy.isVector() &&
3112 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3113 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3114 ST->hasBWI())) {
3115 // Fallback to I if a specific predicate wasn't specified.
3116 CmpInst::Predicate Pred = VecPred;
3117 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3118 Pred == CmpInst::BAD_FCMP_PREDICATE))
3119 Pred = cast<CmpInst>(I)->getPredicate();
3120
3121 bool CmpWithConstant = false;
3122 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3123 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3124
3125 switch (Pred) {
3126 case CmpInst::Predicate::ICMP_NE:
3127 // xor(cmpeq(x,y),-1)
3128 ExtraCost = CmpWithConstant ? 0 : 1;
3129 break;
3130 case CmpInst::Predicate::ICMP_SGE:
3131 case CmpInst::Predicate::ICMP_SLE:
3132 // xor(cmpgt(x,y),-1)
3133 ExtraCost = CmpWithConstant ? 0 : 1;
3134 break;
3135 case CmpInst::Predicate::ICMP_ULT:
3136 case CmpInst::Predicate::ICMP_UGT:
3137 // cmpgt(xor(x,signbit),xor(y,signbit))
3138 // xor(cmpeq(pmaxu(x,y),x),-1)
3139 ExtraCost = CmpWithConstant ? 1 : 2;
3140 break;
3141 case CmpInst::Predicate::ICMP_ULE:
3142 case CmpInst::Predicate::ICMP_UGE:
3143 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3144 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3145 // cmpeq(psubus(x,y),0)
3146 // cmpeq(pminu(x,y),x)
3147 ExtraCost = 1;
3148 } else {
3149 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3150 ExtraCost = CmpWithConstant ? 2 : 3;
3151 }
3152 break;
3153 case CmpInst::Predicate::FCMP_ONE:
3154 case CmpInst::Predicate::FCMP_UEQ:
3155 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3156 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3157 if (CondTy && !ST->hasAVX())
3158 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3159 CmpInst::Predicate::FCMP_UNO, CostKind) +
3160 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3161 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3162 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3163
3164 break;
3165 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3166 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3167 // Assume worst case scenario and add the maximum extra cost.
3168 ExtraCost = 3;
3169 break;
3170 default:
3171 break;
3172 }
3173 }
3174 }
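// Example (illustrative): on a plain SSE2 target an "icmp ult <16 x i8>"
// against a non-constant operand takes the ICMP_ULT/ICMP_UGT path above and
// picks up ExtraCost = 2 (flip the sign bit of both operands, then pcmpgtb)
// on top of the base SETCC cost from the tables below.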
3175
3176 static const CostKindTblEntry SLMCostTbl[] = {
3177 // slm pcmpeq/pcmpgt throughput is 2
3178 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3179 // slm pblendvb/blendvpd/blendvps throughput is 4
3180 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3181 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3182 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3183 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3184 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3185 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3186 };
3187
3188 static const CostKindTblEntry AVX512BWCostTbl[] = {
3189 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3190 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3191 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3192 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3193
3194 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3195 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3196 };
3197
3198 static const CostKindTblEntry AVX512CostTbl[] = {
3199 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3200 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3201 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3202 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3203
3204 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3205 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3206 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3207 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3208 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3209 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3210 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3211
3212 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3213 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3214 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3215 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3216 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3217 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3218 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3219 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3220 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3221 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3222 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3223 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3224 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3225 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3226
3227 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3228 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3229 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3230 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3231 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3232 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3233 };
3234
3235 static const CostKindTblEntry AVX2CostTbl[] = {
3236 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3237 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3238 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3239 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3240 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3241 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3242
3243 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3244 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3245 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3246 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3247
3248 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3249 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3250 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3251 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3252 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3253 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3254 };
3255
3256 static const CostKindTblEntry XOPCostTbl[] = {
3257 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3258 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3259 };
3260
3261 static const CostKindTblEntry AVX1CostTbl[] = {
3262 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3263 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3264 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3265 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3266 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3267 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3268
3269 // AVX1 does not support 8-wide integer compare.
3270 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3271 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3272 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3273 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3274
3275 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3276 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3277 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3278 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3279 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3280 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3281 };
3282
3283 static const CostKindTblEntry SSE42CostTbl[] = {
3284 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3285 };
3286
3287 static const CostKindTblEntry SSE41CostTbl[] = {
3288 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3289 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3290
3291 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3292 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3293 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3294 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3295 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3296 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3297 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3298 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3299 };
3300
3301 static const CostKindTblEntry SSE2CostTbl[] = {
3302 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3303 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3304
3305 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3306 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3307 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3308 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3309
3310 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3311 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3312 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3313 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3314 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3315 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3316 };
3317
3318 static const CostKindTblEntry SSE1CostTbl[] = {
3319 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3320 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3321
3322 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3323 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3324 };
3325
3326 if (ST->useSLMArithCosts())
3327 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3328 if (auto KindCost = Entry->Cost[CostKind])
3329 return LT.first * (ExtraCost + *KindCost);
3330
3331 if (ST->hasBWI())
3332 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3333 if (auto KindCost = Entry->Cost[CostKind])
3334 return LT.first * (ExtraCost + *KindCost);
3335
3336 if (ST->hasAVX512())
3337 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3338 if (auto KindCost = Entry->Cost[CostKind])
3339 return LT.first * (ExtraCost + *KindCost);
3340
3341 if (ST->hasAVX2())
3342 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3343 if (auto KindCost = Entry->Cost[CostKind])
3344 return LT.first * (ExtraCost + *KindCost);
3345
3346 if (ST->hasXOP())
3347 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3348 if (auto KindCost = Entry->Cost[CostKind])
3349 return LT.first * (ExtraCost + *KindCost);
3350
3351 if (ST->hasAVX())
3352 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3353 if (auto KindCost = Entry->Cost[CostKind])
3354 return LT.first * (ExtraCost + *KindCost);
3355
3356 if (ST->hasSSE42())
3357 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3358 if (auto KindCost = Entry->Cost[CostKind])
3359 return LT.first * (ExtraCost + *KindCost);
3360
3361 if (ST->hasSSE41())
3362 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3363 if (auto KindCost = Entry->Cost[CostKind])
3364 return LT.first * (ExtraCost + *KindCost);
3365
3366 if (ST->hasSSE2())
3367 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3368 if (auto KindCost = Entry->Cost[CostKind])
3369 return LT.first * (ExtraCost + *KindCost);
3370
3371 if (ST->hasSSE1())
3372 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3373 if (auto KindCost = Entry->Cost[CostKind])
3374 return LT.first * (ExtraCost + *KindCost);
3375
3376 // Assume a 3cy latency for fp select ops.
3377 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3378 if (ValTy->getScalarType()->isFloatingPointTy())
3379 return 3;
3380
3381 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3382}
3383
3384 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
3385
3386 InstructionCost
3387 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3388 TTI::TargetCostKind CostKind) {
3389 // Costs should match the codegen from:
3390 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3391 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3392 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3393 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3394 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
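// For instance, the AVX2 CTPOP rows can be checked against the actual lowering
// with something like:
//   llc -mtriple=x86_64-- -mattr=+avx2 < llvm/test/CodeGen/X86/vector-popcnt-256.ll
// (illustrative invocation; the RUN lines in the tests themselves carry the
// authoritative flags).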
3395
3396 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3397 // specialized in these tables yet.
3398 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3399 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3400 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3401 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3402 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3403 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3404 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3405 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3406 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3407 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3408 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3409 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3410 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3411 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3412 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3413 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3414 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3415 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3416 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3417 };
3418 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3419 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3420 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3421 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3422 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3423 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3424 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3425 };
3426 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3427 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3428 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3429 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3430 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3431 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3432 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3433 };
3434 static const CostKindTblEntry AVX512CDCostTbl[] = {
3435 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3436 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3437 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3438 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3439 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3440 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3441 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3442 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3443 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3444 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3445 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3446 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3447
3448 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3449 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3450 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3451 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3452 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3453 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3454 };
3455 static const CostKindTblEntry AVX512BWCostTbl[] = {
3456 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3457 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3458 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3459 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3460 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3461 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3462 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3463 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3464 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3465 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3466 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3467 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3468 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3469 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3470 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3471 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3472 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3473 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3474 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3475 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3476 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3477 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3478 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3479 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3480 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3481 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3482 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3483 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3484 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3485 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3486 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3487 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3488 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3489 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3490 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3491 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3492 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3493 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3494 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3495 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3496 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3497 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3498 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3499 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3500 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3501 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3502 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3503 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3504 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3505 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3506 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3507 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3508 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3509 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3510 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3511 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3512 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3513 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3514 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3515 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3516 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3517 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3518 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3519 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3520 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3521 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3522 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3523 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3524 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3525 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3526 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3527 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3528 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3529 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3530 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3531 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3532 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3533 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3534 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3535 };
3536 static const CostKindTblEntry AVX512CostTbl[] = {
3537 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3538 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3539 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3540 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3541 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3542 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3543 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3544 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3545 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3546 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3547 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3548 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3549 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3550 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3551 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3552 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3553 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3554 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3555 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3556 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3557 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3558 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3559 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3560 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3561 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3562 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3563 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3564 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3565 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3566 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3567 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3568 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3569 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3570 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3571 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3572 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3573 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3574 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3575 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3576 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3577 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3578 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3579 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3580 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3581 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3582 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3583 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3584 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3585 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3586 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3587 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3588 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3589 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3590 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3591 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3592 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3593 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3594 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3595 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3596 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3597 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3598 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3599 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3600 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3601 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3602 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3603 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3604 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3605 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3606 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3607 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3608 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3609 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3610 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3611 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3612 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3613 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3614 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3615 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3616 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3617 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3618 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3619 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3620 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3621 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3622 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3623 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3624 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3625 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3626 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3627 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3628 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3629 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3630 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3631 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3632 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3633 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3634 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3635 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3636 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3637 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3638 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3639 };
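// Example (illustrative): the { ISD::USUBSAT, MVT::v16i32, { 2 } } entry above
// reflects the "pmaxud + psubd" expansion noted in its comment, i.e.
// usubsat(x,y) == max(x,y) - y computed as an unsigned max followed by a
// subtract.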
3640 static const CostKindTblEntry XOPCostTbl[] = {
3641 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3642 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3643 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3644 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3645 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3646 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3647 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3648 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3649 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3650 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3651 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3652 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3653 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3654 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3655 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3656 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3657 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3658 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3659 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3660 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3661 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3662 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3663 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3664 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3665 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3666 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3667 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3668 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3669 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3670 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3671 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3672 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3673 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3674 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3675 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3676 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3677 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3678 };
3679 static const CostKindTblEntry AVX2CostTbl[] = {
3680 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3681 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3682 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3683 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3684 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3685 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3686 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3687 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3688 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3689 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3690 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3691 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3692 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3693 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3694 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3695 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3696 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3697 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3698 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3699 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3700 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3701 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3702 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3703 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3704 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3705 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3706 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3707 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3708 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3709 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3710 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3711 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3712 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3713 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3714 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3715 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3716 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3717 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3718 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3719 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3720 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3721 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3722 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3723 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3724 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3725 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3726 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3727 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3728 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3729 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3730 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3731 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3732 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3733 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3734 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3735 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3736 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3737 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3738 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3739 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3740 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3741 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3742 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3743 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3744 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3745 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3746 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3747 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3748 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3749 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3750 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3751 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3752 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3753 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3754 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3755 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3756 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3757 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3758 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3759 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3760 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3761 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3762 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3763 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3764 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3765 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3766 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3767 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3768 };
3769 static const CostKindTblEntry AVX1CostTbl[] = {
3770 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3771 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3772 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3773 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3774 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3775 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3776 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3777 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3778 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3779 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3780 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3781 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3782 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3783 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3784 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3785 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3786 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3787 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3788 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3789 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3790 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3791 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3792 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3793 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3794 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3795 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3796 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3797 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3798 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3799 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3800 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3801 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3802 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3803 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3804 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3805 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3806 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3807 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3808 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3809 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3810 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3811 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3812 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3813 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3814 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3815 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3816 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3817 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3818 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3819 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3820 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3821 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3822 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3823 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3824 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3825 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3826 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3827 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3828 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3829 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3830 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3831 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3832 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3833 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3834 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3835 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3836 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3837 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3838 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3839 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3840 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3841 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3842 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3843 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3844 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3845 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3846 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3847 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3848 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3849 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3850 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3851 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3852 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3853 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3854 };
3855 static const CostKindTblEntry GFNICostTbl[] = {
3856 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
3857 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
3858 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
3859 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
3860 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3861 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3862 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3863 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
3864 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3865 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3866 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
3867 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3868 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3869 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
3870 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3871 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3872 };
3873 static const CostKindTblEntry GLMCostTbl[] = {
3874 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3875 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3876 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3877 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3878 };
3879 static const CostKindTblEntry SLMCostTbl[] = {
3880 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3881 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3882 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3883 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3884 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3885 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3886 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3887 };
3888 static const CostKindTblEntry SSE42CostTbl[] = {
3889 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3890 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3891 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3892 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3893 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3894 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3895 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3896 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3897 };
3898 static const CostKindTblEntry SSE41CostTbl[] = {
3899 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3900 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3901 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3902 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3903 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3904 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3905 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3906 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3907 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3908 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3909 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3910 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3911 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3912 };
3913 static const CostKindTblEntry SSSE3CostTbl[] = {
3914 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3915 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3916 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3917 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3918 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3919 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3920 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3921 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3922 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
3923 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
3924 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
3925 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
3926 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
3927 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
3928 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
3929 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
3930 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
3931 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
3932 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
3933 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
3934 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
3935 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
3936 };
3937 static const CostKindTblEntry SSE2CostTbl[] = {
3938 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
3939 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
3940 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
3941 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
3942 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
3943 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
3944 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
3945 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
3946 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
3947 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
3948 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
3949 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
3950 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
3951 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
3952 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
3953 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
3954 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
3955 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
3956 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
3957 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
3958 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
3959 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
3960 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
3961 { ISD::SADDSAT, MVT::v8i16, { 1 } },
3962 { ISD::SADDSAT, MVT::v16i8, { 1 } },
3963 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3964 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
3965 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3966 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
3967 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3968 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
3969 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3970 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
3971 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
3972 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
3973 { ISD::UADDSAT, MVT::v8i16, { 1 } },
3974 { ISD::UADDSAT, MVT::v16i8, { 1 } },
3975 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3976 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
3977 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
3978 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3979 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3980 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
3981 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
3982 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3983 { ISD::USUBSAT, MVT::v8i16, { 1 } },
3984 { ISD::USUBSAT, MVT::v16i8, { 1 } },
3985 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
3986 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
3987 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3988 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3989 };
3990 static const CostKindTblEntry SSE1CostTbl[] = {
3991 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
3992 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
3993 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
3994 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
3995 };
3996 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
3997 { ISD::CTTZ, MVT::i64, { 1 } },
3998 };
3999 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4000 { ISD::CTTZ, MVT::i32, { 1 } },
4001 { ISD::CTTZ, MVT::i16, { 1 } },
4002 { ISD::CTTZ, MVT::i8, { 1 } },
4003 };
4004 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4005 { ISD::CTLZ, MVT::i64, { 1 } },
4006 };
4007 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4008 { ISD::CTLZ, MVT::i32, { 1 } },
4009 { ISD::CTLZ, MVT::i16, { 2 } },
4010 { ISD::CTLZ, MVT::i8, { 2 } },
4011 };
4012 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4013 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4014 };
4015 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4016 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4017 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4018 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4019 };
4020 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4021 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
4022 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4023 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4024 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4025 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
4026 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
4027 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF
4028 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4029 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4030 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4031 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4032 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4033 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4034 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4035 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4036 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4037 { ISD::SADDO, MVT::i64, { 1 } },
4038 { ISD::UADDO, MVT::i64, { 1 } },
4039 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
4040 };
4041 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4042 { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4043 { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4044 { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
4045 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4046 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4047 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4048 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4049 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4050 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4051 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4052 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4053 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
4054 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
4055 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4056 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
4057 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
4058 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
4059 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
4060 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
4061 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
4062 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4063 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4064 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4065 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4066 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4067 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4068 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4069 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4070 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4071 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4072 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4073 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4074 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4075 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4076 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4077 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4078 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4079 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4080 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4081 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4082 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4083 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4084 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4085 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4086 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4087 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4088 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4089 { ISD::SADDO, MVT::i32, { 1 } },
4090 { ISD::SADDO, MVT::i16, { 1 } },
4091 { ISD::SADDO, MVT::i8, { 1 } },
4092 { ISD::UADDO, MVT::i32, { 1 } },
4093 { ISD::UADDO, MVT::i16, { 1 } },
4094 { ISD::UADDO, MVT::i8, { 1 } },
4095 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4096 { ISD::UMULO, MVT::i16, { 2 } },
4097 { ISD::UMULO, MVT::i8, { 2 } },
4098 };
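 // Note: each table entry above lists costs as
 // { RecipThroughput, Latency, CodeSize, SizeAndLatency }. Entries that only
 // provide a single value define just the throughput cost, so a lookup for the
 // other cost kinds finds nothing and falls through to a later table or to the
 // base implementation.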
4099
4100 Type *RetTy = ICA.getReturnType();
4101 Type *OpTy = RetTy;
4102 Intrinsic::ID IID = ICA.getID();
4103 unsigned ISD = ISD::DELETED_NODE;
4104 switch (IID) {
4105 default:
4106 break;
4107 case Intrinsic::abs:
4108 ISD = ISD::ABS;
4109 break;
4110 case Intrinsic::bitreverse:
4111 ISD = ISD::BITREVERSE;
4112 break;
4113 case Intrinsic::bswap:
4114 ISD = ISD::BSWAP;
4115 break;
4116 case Intrinsic::ctlz:
4117 ISD = ISD::CTLZ;
4118 break;
4119 case Intrinsic::ctpop:
4120 ISD = ISD::CTPOP;
4121 break;
4122 case Intrinsic::cttz:
4123 ISD = ISD::CTTZ;
4124 break;
4125 case Intrinsic::fshl:
4126 ISD = ISD::FSHL;
4127 if (!ICA.isTypeBasedOnly()) {
4128 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4129 if (Args[0] == Args[1]) {
4130 ISD = ISD::ROTL;
4131 // Handle uniform constant rotation amounts.
4132 // TODO: Handle funnel-shift cases.
4133 const APInt *Amt;
4134 if (Args[2] &&
4136 ISD = X86ISD::VROTLI;
4137 }
4138 }
4139 break;
4140 case Intrinsic::fshr:
4141 // FSHR has the same costs, so don't duplicate.
4142 ISD = ISD::FSHL;
4143 if (!ICA.isTypeBasedOnly()) {
4144 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4145 if (Args[0] == Args[1]) {
4146 ISD = ISD::ROTR;
4147 // Handle uniform constant rotation amount.
4148 // TODO: Handle funnel-shift cases.
4149 const APInt *Amt;
4150 if (Args[2] &&
4152 ISD = X86ISD::VROTLI;
4153 }
4154 }
4155 break;
4156 case Intrinsic::lrint:
4157 case Intrinsic::llrint:
4158 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4159 // have the same costs as the CVTTP2SI (fptosi) instructions.
4160 if (!ICA.isTypeBasedOnly()) {
4161 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4162 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4163 TTI::CastContextHint::None, CostKind);
4164 }
4165 break;
4166 case Intrinsic::maxnum:
4167 case Intrinsic::minnum:
4168 // FMINNUM has the same costs, so don't duplicate.
4169 ISD = ISD::FMAXNUM;
4170 break;
4171 case Intrinsic::sadd_sat:
4172 ISD = ISD::SADDSAT;
4173 break;
4174 case Intrinsic::smax:
4175 ISD = ISD::SMAX;
4176 break;
4177 case Intrinsic::smin:
4178 ISD = ISD::SMIN;
4179 break;
4180 case Intrinsic::ssub_sat:
4181 ISD = ISD::SSUBSAT;
4182 break;
4183 case Intrinsic::uadd_sat:
4184 ISD = ISD::UADDSAT;
4185 break;
4186 case Intrinsic::umax:
4187 ISD = ISD::UMAX;
4188 break;
4189 case Intrinsic::umin:
4190 ISD = ISD::UMIN;
4191 break;
4192 case Intrinsic::usub_sat:
4193 ISD = ISD::USUBSAT;
4194 break;
4195 case Intrinsic::sqrt:
4196 ISD = ISD::FSQRT;
4197 break;
4198 case Intrinsic::sadd_with_overflow:
4199 case Intrinsic::ssub_with_overflow:
4200 // SSUBO has the same costs, so don't duplicate.
4201 ISD = ISD::SADDO;
4202 OpTy = RetTy->getContainedType(0);
4203 break;
4204 case Intrinsic::uadd_with_overflow:
4205 case Intrinsic::usub_with_overflow:
4206 // USUBO has the same costs, so don't duplicate.
4207 ISD = ISD::UADDO;
4208 OpTy = RetTy->getContainedType(0);
4209 break;
4210 case Intrinsic::umul_with_overflow:
4211 case Intrinsic::smul_with_overflow:
4212 // SMULO has the same costs, so don't duplicate.
4213 ISD = ISD::UMULO;
4214 OpTy = RetTy->getContainedType(0);
4215 break;
4216 }
4217
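 // Once the intrinsic has been mapped to an ISD/X86ISD opcode, legalize the
 // operand/return type and probe the tables above, starting with the most
 // specific ISA extension and ending with the generic x86 entries; the first
 // table that defines a cost for the requested CostKind wins.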
4218 if (ISD != ISD::DELETED_NODE) {
4219 // Legalize the type.
4220 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4221 MVT MTy = LT.second;
4222
4223 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4224 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4225 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4226 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4227 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4228 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4229 if (Cst->isAllOnesValue())
4230 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4231 }
4232
4233 // FSQRT is a single instruction.
4234 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4235 return LT.first;
4236
4237 auto adjustTableCost = [](int ISD, unsigned Cost,
4238 InstructionCost LegalizationCost,
4239 FastMathFlags FMF) {
4240 // If there are no NaNs to deal with, then these are reduced to a
4241 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4242 // assume is used in the non-fast case.
4243 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4244 if (FMF.noNaNs())
4245 return LegalizationCost * 1;
4246 }
4247 return LegalizationCost * (int)Cost;
4248 };
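 // Example: an fsqrt of <16 x float> on an AVX2-only target legalizes to two
 // v8f32 ops (LT.first == 2), so the AVX2 table's throughput cost of 14 is
 // scaled to 28 here.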
4249
4250 if (ST->useGLMDivSqrtCosts())
4251 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4252 if (auto KindCost = Entry->Cost[CostKind])
4253 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4254 ICA.getFlags());
4255
4256 if (ST->useSLMArithCosts())
4257 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4258 if (auto KindCost = Entry->Cost[CostKind])
4259 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4260 ICA.getFlags());
4261
4262 if (ST->hasVBMI2())
4263 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4264 if (auto KindCost = Entry->Cost[CostKind])
4265 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4266 ICA.getFlags());
4267
4268 if (ST->hasBITALG())
4269 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4270 if (auto KindCost = Entry->Cost[CostKind])
4271 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4272 ICA.getFlags());
4273
4274 if (ST->hasVPOPCNTDQ())
4275 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4276 if (auto KindCost = Entry->Cost[CostKind])
4277 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4278 ICA.getFlags());
4279
4280 if (ST->hasGFNI())
4281 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4282 if (auto KindCost = Entry->Cost[CostKind])
4283 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4284 ICA.getFlags());
4285
4286 if (ST->hasCDI())
4287 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4288 if (auto KindCost = Entry->Cost[CostKind])
4289 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4290 ICA.getFlags());
4291
4292 if (ST->hasBWI())
4293 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4294 if (auto KindCost = Entry->Cost[CostKind])
4295 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4296 ICA.getFlags());
4297
4298 if (ST->hasAVX512())
4299 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4300 if (auto KindCost = Entry->Cost[CostKind])
4301 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4302 ICA.getFlags());
4303
4304 if (ST->hasXOP())
4305 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4306 if (auto KindCost = Entry->Cost[CostKind])
4307 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4308 ICA.getFlags());
4309
4310 if (ST->hasAVX2())
4311 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4312 if (auto KindCost = Entry->Cost[CostKind])
4313 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4314 ICA.getFlags());
4315
4316 if (ST->hasAVX())
4317 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4318 if (auto KindCost = Entry->Cost[CostKind])
4319 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4320 ICA.getFlags());
4321
4322 if (ST->hasSSE42())
4323 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4324 if (auto KindCost = Entry->Cost[CostKind])
4325 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4326 ICA.getFlags());
4327
4328 if (ST->hasSSE41())
4329 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4330 if (auto KindCost = Entry->Cost[CostKind])
4331 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4332 ICA.getFlags());
4333
4334 if (ST->hasSSSE3())
4335 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4336 if (auto KindCost = Entry->Cost[CostKind])
4337 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4338 ICA.getFlags());
4339
4340 if (ST->hasSSE2())
4341 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4342 if (auto KindCost = Entry->Cost[CostKind])
4343 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4344 ICA.getFlags());
4345
4346 if (ST->hasSSE1())
4347 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4348 if (auto KindCost = Entry->Cost[CostKind])
4349 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4350 ICA.getFlags());
4351
4352 if (ST->hasBMI()) {
4353 if (ST->is64Bit())
4354 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4355 if (auto KindCost = Entry->Cost[CostKind])
4356 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4357 ICA.getFlags());
4358
4359 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4360 if (auto KindCost = Entry->Cost[CostKind])
4361 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4362 ICA.getFlags());
4363 }
4364
4365 if (ST->hasLZCNT()) {
4366 if (ST->is64Bit())
4367 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4368 if (auto KindCost = Entry->Cost[CostKind])
4369 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4370 ICA.getFlags());
4371
4372 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4373 if (auto KindCost = Entry->Cost[CostKind])
4374 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4375 ICA.getFlags());
4376 }
4377
4378 if (ST->hasPOPCNT()) {
4379 if (ST->is64Bit())
4380 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4381 if (auto KindCost = Entry->Cost[CostKind])
4382 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4383 ICA.getFlags());
4384
4385 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4386 if (auto KindCost = Entry->Cost[CostKind])
4387 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4388 ICA.getFlags());
4389 }
4390
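 // With fast MOVBE, a bswap whose only use is a store, or whose operand is a
 // single-use load, folds into the memory access, so treat it as free.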
4391 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4392 if (const Instruction *II = ICA.getInst()) {
4393 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4394 return TTI::TCC_Free;
4395 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4396 if (LI->hasOneUse())
4397 return TTI::TCC_Free;
4398 }
4399 }
4400 }
4401
4402 if (ST->is64Bit())
4403 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4404 if (auto KindCost = Entry->Cost[CostKind])
4405 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4406 ICA.getFlags());
4407
4408 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4409 if (auto KindCost = Entry->Cost[CostKind])
4410 return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4411 }
4412
4413 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4414}
4415
4416 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4417 TTI::TargetCostKind CostKind,
4418 unsigned Index, Value *Op0,
4419 Value *Op1) {
4420 static const CostTblEntry SLMCostTbl[] = {
4421 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4422 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4423 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4424 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4425 };
4426
4427 assert(Val->isVectorTy() && "This must be a vector type");
4428 Type *ScalarType = Val->getScalarType();
4429 InstructionCost RegisterFileMoveCost = 0;
4430
4431 // Non-immediate extraction/insertion can be handled as a sequence of
4432 // aliased loads+stores via the stack.
4433 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4434 Opcode == Instruction::InsertElement)) {
4435 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4436 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4437
4438 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4439 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4440 Align VecAlign = DL.getPrefTypeAlign(Val);
4441 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4442
4443 // Extract - store vector to stack, load scalar.
4444 if (Opcode == Instruction::ExtractElement) {
4445 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4446 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4447 CostKind);
4448 }
4449 // Insert - store vector to stack, store scalar, load vector.
4450 if (Opcode == Instruction::InsertElement) {
4451 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4452 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4453 CostKind) +
4454 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4455 }
4456 }
4457
4458 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4459 Opcode == Instruction::InsertElement)) {
4460 // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
4461 if (Opcode == Instruction::ExtractElement &&
4462 ScalarType->getScalarSizeInBits() == 1 &&
4463 cast<FixedVectorType>(Val)->getNumElements() > 1)
4464 return 1;
4465
4466 // Legalize the type.
4467 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4468
4469 // This type is legalized to a scalar type.
4470 if (!LT.second.isVector())
4471 return 0;
4472
4473 // The type may be split. Normalize the index to the new type.
4474 unsigned SizeInBits = LT.second.getSizeInBits();
4475 unsigned NumElts = LT.second.getVectorNumElements();
4476 unsigned SubNumElts = NumElts;
4477 Index = Index % NumElts;
4478
4479 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4480 // For inserts, we also need to insert the subvector back.
4481 if (SizeInBits > 128) {
4482 assert((SizeInBits % 128) == 0 && "Illegal vector");
4483 unsigned NumSubVecs = SizeInBits / 128;
4484 SubNumElts = NumElts / NumSubVecs;
4485 if (SubNumElts <= Index) {
4486 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4487 Index %= SubNumElts;
4488 }
4489 }
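 // e.g. extracting element 5 of a v8i32 on an AVX1 target: the 256-bit vector
 // is split into two 128-bit lanes, so we pay one subvector extract
 // (RegisterFileMoveCost) and then cost element 1 of the upper lane.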
4490
4491 MVT MScalarTy = LT.second.getScalarType();
4492 auto IsCheapPInsrPExtrInsertPS = [&]() {
4493 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4494 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4495 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4496 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4497 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4498 Opcode == Instruction::InsertElement);
4499 };
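 // i.e. the element can be moved with a single PINSR*/PEXTR* (or INSERTPS for
 // f32 inserts) rather than going through memory or a longer shuffle sequence.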
4500
4501 if (Index == 0) {
4502 // Floating point scalars are already located in index #0.
4503 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4504 // true for all.
4505 if (ScalarType->isFloatingPointTy() &&
4506 (Opcode != Instruction::InsertElement || !Op0 ||
4507 isa<UndefValue>(Op0)))
4508 return RegisterFileMoveCost;
4509
4510 if (Opcode == Instruction::InsertElement &&
4511 isa_and_nonnull<UndefValue>(Op0)) {
4512 // Consider the gather cost to be cheap.
4513 if (isa_and_nonnull<LoadInst>(Op1))
4514 return RegisterFileMoveCost;
4515 if (!IsCheapPInsrPExtrInsertPS()) {
4516 // mov constant-to-GPR + movd/movq GPR -> XMM.
4517 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4518 return 2 + RegisterFileMoveCost;
4519 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4520 return 1 + RegisterFileMoveCost;
4521 }
4522 }
4523
4524 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4525 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4526 return 1 + RegisterFileMoveCost;
4527 }
4528
4529 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4530 assert(ISD && "Unexpected vector opcode");
4531 if (ST->useSLMArithCosts())
4532 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4533 return Entry->Cost + RegisterFileMoveCost;
4534
4535 // Consider cheap cases.
4536 if (IsCheapPInsrPExtrInsertPS())
4537 return 1 + RegisterFileMoveCost;
4538
4539 // For extractions we just need to shuffle the element to index 0, which
4540 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4541 // the elements to its destination. In both cases we must handle the
4542 // subvector move(s).
4543 // If the vector type is already less than 128-bits then don't reduce it.
4544 // TODO: Under what circumstances should we shuffle using the full width?
4545 InstructionCost ShuffleCost = 1;
4546 if (Opcode == Instruction::InsertElement) {
4547 auto *SubTy = cast<VectorType>(Val);
4548 EVT VT = TLI->getValueType(DL, Val);
4549 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4550 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4551 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4552 CostKind, 0, SubTy);
4553 }
4554 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4555 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4556 }
4557
4558 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4559 RegisterFileMoveCost;
4560}
4561
4562 InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
4563 const APInt &DemandedElts,
4564 bool Insert, bool Extract,
4565 TTI::TargetCostKind CostKind) {
4566 assert(DemandedElts.getBitWidth() ==
4567 cast<FixedVectorType>(Ty)->getNumElements() &&
4568 "Vector size mismatch");
4569
4570 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4571 MVT MScalarTy = LT.second.getScalarType();
4572 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4573 InstructionCost Cost = 0;
4574
4575 constexpr unsigned LaneBitWidth = 128;
4576 assert((LegalVectorBitWidth < LaneBitWidth ||
4577 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4578 "Illegal vector");
4579
4580 const int NumLegalVectors = *LT.first.getValue();
4581 assert(NumLegalVectors >= 0 && "Negative cost!");
4582
4583 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4584 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4585 if (Insert) {
4586 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4587 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4588 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4589 // For types we can insert directly, insertion into 128-bit sub vectors is
4590 // cheap, followed by a cheap chain of concatenations.
4591 if (LegalVectorBitWidth <= LaneBitWidth) {
4592 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4593 /*Extract*/ false, CostKind);
4594 } else {
4595 // In each 128-lane, if at least one index is demanded but not all
4596 // indices are demanded and this 128-lane is not the first 128-lane of
4597 // the legalized-vector, then this 128-lane needs an extracti128; if in
4598 // each 128-lane, there is at least one demanded index, this 128-lane
4599 // needs an inserti128.
4600
4601 // The following cases will help you build a better understanding:
4602 // Assume we insert several elements into a v8i32 vector with AVX2:
4603 // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
4604 // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
4605 // inserti128.
4606 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4607 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4608 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4609 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4610 unsigned NumLegalElts =
4611 LT.second.getVectorNumElements() * NumLegalVectors;
4612 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4613 "Vector has been legalized to smaller element count");
4614 assert((NumLegalElts % NumLanesTotal) == 0 &&
4615 "Unexpected elts per lane");
4616 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4617
4618 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4619 auto *LaneTy =
4620 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4621
4622 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4623 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4624 NumEltsPerLane, NumEltsPerLane * I);
4625 if (LaneEltMask.isZero())
4626 continue;
4627 // FIXME: we don't need to extract if all non-demanded elements
4628 // are legalization-inserted padding.
4629 if (!LaneEltMask.isAllOnes())
4630 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4631 CostKind, I * NumEltsPerLane, LaneTy);
4632 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4633 /*Extract*/ false, CostKind);
4634 }
4635
4636 APInt AffectedLanes =
4637 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4638 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4639 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4640 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4641 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4642 unsigned I = NumLegalLanes * LegalVec + Lane;
4643 // No need to insert unaffected lane; or lane 0 of each legal vector
4644 // iff ALL lanes of that vector were affected and will be inserted.
4645 if (!AffectedLanes[I] ||
4646 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4647 continue;
4648 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4649 CostKind, I * NumEltsPerLane, LaneTy);
4650 }
4651 }
4652 }
4653 } else if (LT.second.isVector()) {
4654 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4655 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4656 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4657 // considered cheap.
4658 if (Ty->isIntOrIntVectorTy())
4659 Cost += DemandedElts.popcount();
4660
4661 // Get the smaller of the legalized or original pow2-extended number of
4662 // vector elements, which represents the number of unpacks we'll end up
4663 // performing.
4664 unsigned NumElts = LT.second.getVectorNumElements();
4665 unsigned Pow2Elts =
4666 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4667 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4668 }
4669 }
4670
4671 if (Extract) {
4672 // vXi1 can be efficiently extracted with MOVMSK.
4673 // TODO: AVX512 predicate mask handling.
4674 // NOTE: This doesn't work well for roundtrip scalarization.
4675 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4676 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4677 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
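 // e.g. extracting all the bits of a v64i1 on an AVX2 target needs
 // ceil(64 / 32) == 2 MOVMSK ops.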
4678 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4679 return MOVMSKCost;
4680 }
4681
4682 if (LT.second.isVector()) {
4683 unsigned NumLegalElts =
4684 LT.second.getVectorNumElements() * NumLegalVectors;
4685 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4686 "Vector has been legalized to smaller element count");
4687
4688 // If we're extracting elements from a 128-bit subvector lane,
4689 // we only need to extract each lane once, not for every element.
4690 if (LegalVectorBitWidth > LaneBitWidth) {
4691 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4692 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4693 assert((NumLegalElts % NumLanesTotal) == 0 &&
4694 "Unexpected elts per lane");
4695 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4696
4697 // Add cost for each demanded 128-bit subvector extraction.
4698 // Luckily this is a lot easier than for insertion.
4699 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4700 auto *LaneTy =
4701 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4702
4703 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4704 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4705 NumEltsPerLane, I * NumEltsPerLane);
4706 if (LaneEltMask.isZero())
4707 continue;
4708 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4709 CostKind, I * NumEltsPerLane, LaneTy);
4710 Cost += BaseT::getScalarizationOverhead(
4711 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4712 }
4713
4714 return Cost;
4715 }
4716 }
4717
4718 // Fallback to default extraction.
4719 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4720 Extract, CostKind);
4721 }
4722
4723 return Cost;
4724}
4725
4726 InstructionCost
4727 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4728 int VF, const APInt &DemandedDstElts,
4729 TTI::TargetCostKind CostKind) {
4730 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4731 // We don't differentiate element types here, only element bit width.
4732 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4733
4734 auto bailout = [&]() {
4735 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4736 DemandedDstElts, CostKind);
4737 };
4738
4739 // For now, only deal with AVX512 cases.
4740 if (!ST->hasAVX512())
4741 return bailout();
4742
4743 // Do we have a native shuffle for this element type, or should we promote?
4744 unsigned PromEltTyBits = EltTyBits;
4745 switch (EltTyBits) {
4746 case 32:
4747 case 64:
4748 break; // AVX512F.
4749 case 16:
4750 if (!ST->hasBWI())
4751 PromEltTyBits = 32; // promote to i32, AVX512F.
4752 break; // AVX512BW
4753 case 8:
4754 if (!ST->hasVBMI())
4755 PromEltTyBits = 32; // promote to i32, AVX512F.
4756 break; // AVX512VBMI
4757 case 1:
4758 // There is no support for shuffling i1 elements. We *must* promote.
4759 if (ST->hasBWI()) {
4760 if (ST->hasVBMI())
4761 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4762 else
4763 PromEltTyBits = 16; // promote to i16, AVX512BW.
4764 break;
4765 }
4766 PromEltTyBits = 32; // promote to i32, AVX512F.
4767 break;
4768 default:
4769 return bailout();
4770 }
4771 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4772
4773 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4774 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4775
4776 int NumDstElements = VF * ReplicationFactor;
4777 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4778 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4779
4780 // Legalize the types.
4781 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4782 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4783 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4784 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4785 // They should have legalized into vector types.
4786 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4787 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4788 return bailout();
4789
4790 if (PromEltTyBits != EltTyBits) {
4791 // If we have to perform the shuffle with a wider elt type than our data type,
4792 // then we will first need to anyext the source elements (we don't care
4793 // about the new bits), and then truncate the Dst elements.
4794 InstructionCost PromotionCost;
4795 PromotionCost += getCastInstrCost(
4796 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4797 TTI::CastContextHint::None, CostKind);
4798 PromotionCost +=
4799 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4800 /*Src=*/PromDstVecTy,
4801 TTI::CastContextHint::None, CostKind);
4802 return PromotionCost + getReplicationShuffleCost(PromEltTy,
4803 ReplicationFactor, VF,
4804 DemandedDstElts, CostKind);
4805 }
4806
4807 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4808 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4809 "We expect that the legalization doesn't affect the element width, "
4810 "doesn't coalesce/split elements.");
4811
4812 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4813 unsigned NumDstVectors =
4814 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4815
4816 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4817
4818 // Not all the produced Dst elements may be demanded. In our case,
4819 // given that a single Dst vector is formed by a single shuffle,
4820 // if none of the elements that would form a single Dst vector are demanded,
4821 // then we won't need to do that shuffle, so adjust the cost accordingly.
4822 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4823 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4824 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
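 // e.g. with AVX512VBMI, replicating each element of a <16 x i8> source 4
 // times gives a <64 x i8> destination that legalizes to a single 512-bit
 // vector, so only one shuffle (e.g. VPERMB) is charged when any of its
 // elements are demanded.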
4825
4826 InstructionCost SingleShuffleCost = getShuffleCost(
4827 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4828 /*Index=*/0, /*SubTp=*/nullptr);
4829 return NumDstVectorsDemanded * SingleShuffleCost;
4830}
4831
4832 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4833 MaybeAlign Alignment,
4834 unsigned AddressSpace,
4835 TTI::TargetCostKind CostKind,
4836 TTI::OperandValueInfo OpInfo,
4837 const Instruction *I) {
4838 // TODO: Handle other cost kinds.
4839 if (CostKind != TTI::TCK_RecipThroughput) {
4840 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4841 // Store instruction with index and scale costs 2 Uops.
4842 // Check the preceding GEP to identify non-const indices.
4843 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4844 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4845 return TTI::TCC_Basic * 2;
4846 }
4847 }
4848 return TTI::TCC_Basic;
4849 }
4850
4851 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4852 "Invalid Opcode");
4853 // Type legalization can't handle structs
4854 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4855 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4856 CostKind);
4857
4858 // Legalize the type.
4859 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4860
4861 auto *VTy = dyn_cast<FixedVectorType>(Src);
4862
4863 InstructionCost Cost = 0;
4864
4865 // Add a cost for constant load to vector.
4866 if (Opcode == Instruction::Store && OpInfo.isConstant())
4867 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4868 /*AddressSpace=*/0, CostKind);
4869
4870 // Handle the simple case of non-vectors.
4871 // NOTE: this assumes that legalization never creates vector from scalars!
4872 if (!VTy || !LT.second.isVector()) {
4873 // Each load/store unit costs 1.
4874 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4875 }
4876
4877 bool IsLoad = Opcode == Instruction::Load;
4878
4879 Type *EltTy = VTy->getElementType();
4880
4881 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4882
4883 // Source of truth: how many elements were there in the original IR vector?
4884 const unsigned SrcNumElt = VTy->getNumElements();
4885
4886 // How far have we gotten?
4887 int NumEltRemaining = SrcNumElt;
4888 // Note that we intentionally capture by reference, since NumEltRemaining changes.
4889 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4890
4891 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4892
4893 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4894 const unsigned XMMBits = 128;
4895 if (XMMBits % EltTyBits != 0)
4896 // Vector size must be a multiple of the element size. I.e. no padding.
4897 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4898 CostKind);
4899 const int NumEltPerXMM = XMMBits / EltTyBits;
4900
4901 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4902
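 // Walk from the widest legal operation down to single elements: each pass of
 // the outer loop halves the op width, and the inner loop charges one memory
 // op per chunk plus any subvector insert/extract (free for the 0'th
 // subvector) or scalarization needed for sub-32-bit tails.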
4903 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4904 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4905 // How many elements would a single op deal with at once?
4906 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4907 // Vector size must be a multiple of the element size. I.e. no padding.
4908 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4909 CostKind);
4910 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4911
4912 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4913 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4914 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4915 "Unless we haven't halved the op size yet, "
4916 "we have less than two op's sized units of work left.");
4917
4918 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4919 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4920 : XMMVecTy;
4921
4922 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4923 "After halving sizes, the vector elt count is no longer a multiple "
4924 "of number of elements per operation?");
4925 auto *CoalescedVecTy =
4926 CurrNumEltPerOp == 1
4927 ? CurrVecTy
4928 : FixedVectorType::get(
4929 IntegerType::get(Src->getContext(),
4930 EltTyBits * CurrNumEltPerOp),
4931 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4932 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4933 DL.getTypeSizeInBits(CurrVecTy) &&
4934 "coalesciing elements doesn't change vector width.");
4935
4936 while (NumEltRemaining > 0) {
4937 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
4938
4939 // Can we use this vector size, as per the remaining element count?
4940 // Iff the vector is naturally aligned, we can do a wide load regardless.
4941 if (NumEltRemaining < CurrNumEltPerOp &&
4942 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4943 CurrOpSizeBytes != 1)
4944 break; // Try a smaller vector size.
4945
4946 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4947
4948 // If we have fully processed the previous reg, we need to replenish it.
4949 if (SubVecEltsLeft == 0) {
4950 SubVecEltsLeft += CurrVecTy->getNumElements();
4951 // And that's free only for the 0'th subvector of a legalized vector.
4952 if (!Is0thSubVec)
4953 Cost += getShuffleCost(IsLoad ? TTI::SK_InsertSubvector
4954 : TTI::SK_ExtractSubvector,
4955 VTy, std::nullopt, CostKind, NumEltDone(),
4956 CurrVecTy);
4957 }
4958
4959 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4960 // for smaller widths (32/16/8) we have to insert/extract them separately.
4961 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4962 // but let's pretend that it is also true for 16/8 bit wide ops...)
4963 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4964 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4965 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4966 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4967 APInt DemandedElts =
4968 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4969 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4970 assert(DemandedElts.popcount() == 1 && "Inserting single value");
4971 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4972 !IsLoad, CostKind);
4973 }
4974
4975 // This isn't exactly right. We're using slow unaligned 32-byte accesses
4976 // as a proxy for a double-pumped AVX memory interface such as on
4977 // Sandybridge.
4978 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
4979 // will be scalarized.
4980 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4981 Cost += 2;
4982 else if (CurrOpSizeBytes < 4)
4983 Cost += 2;
4984 else
4985 Cost += 1;
4986
4987 SubVecEltsLeft -= CurrNumEltPerOp;
4988 NumEltRemaining -= CurrNumEltPerOp;
4989 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4990 }
4991 }
4992
4993 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4994
4995 return Cost;
4996}
4997
4998 InstructionCost
4999 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5000 unsigned AddressSpace,
5001 TTI::TargetCostKind CostKind) {
5002 bool IsLoad = (Instruction::Load == Opcode);
5003 bool IsStore = (Instruction::Store == Opcode);
5004
5005 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5006 if (!SrcVTy)
5007 // To calculate scalar take the regular cost, without mask
5008 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5009
5010 unsigned NumElem = SrcVTy->getNumElements();
5011 auto *MaskTy =
5012 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5013 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5014 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5015 // Scalarization
5016 APInt DemandedElts = APInt::getAllOnes(NumElem);
5017     InstructionCost MaskSplitCost = getScalarizationOverhead(
5018         MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5019 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5020 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5021         CmpInst::BAD_ICMP_PREDICATE, CostKind);
5022     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5023 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5024     InstructionCost ValueSplitCost = getScalarizationOverhead(
5025         SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5026 InstructionCost MemopCost =
5027 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5028 Alignment, AddressSpace, CostKind);
5029 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5030 }
5031
5032 // Legalize the type.
5033 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5034 auto VT = TLI->getValueType(DL, SrcVTy);
5035   InstructionCost Cost = 0;
5036   if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
5037 LT.second.getVectorNumElements() == NumElem)
5038 // Promotion requires extend/truncate for data and a shuffle for mask.
5039 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
5040 CostKind, 0, nullptr) +
5041 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
5042 CostKind, 0, nullptr);
5043
5044 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
5045 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5046 LT.second.getVectorNumElements());
5047 // Expanding requires fill mask with zeroes
5048 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
5049 CostKind, 0, MaskTy);
5050 }
5051
5052 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5053 if (!ST->hasAVX512())
5054 return Cost + LT.first * (IsLoad ? 2 : 8);
5055
5056 // AVX-512 masked load/store is cheaper
5057 return Cost + LT.first;
5058}
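// Illustrative note (annotation): with the costs above, a masked load of
// <8 x float> that legalizes to a single YMM (LT.first == 1) comes out at ~2
// on pre-AVX512 targets and ~1 with AVX-512, while the pre-AVX512 masked
// store of the same type is ~8.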
5059
5060 InstructionCost
5061 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5062                                  const Value *Base,
5063 const TTI::PointersChainInfo &Info,
5064 Type *AccessTy, TTI::TargetCostKind CostKind) {
5065 if (Info.isSameBase() && Info.isKnownStride()) {
5066 // If all the pointers have known stride all the differences are translated
5067 // into constants. X86 memory addressing allows encoding it into
5068 // displacement. So we just need to take the base GEP cost.
5069 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5070 SmallVector<const Value *> Indices(BaseGEP->indices());
5071 return getGEPCost(BaseGEP->getSourceElementType(),
5072 BaseGEP->getPointerOperand(), Indices, nullptr,
5073 CostKind);
5074 }
5075 return TTI::TCC_Free;
5076 }
5077 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5078}
5079
5080 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5081                                                        ScalarEvolution *SE,
5082 const SCEV *Ptr) {
5083 // Address computations in vectorized code with non-consecutive addresses will
5084 // likely result in more instructions compared to scalar code where the
5085 // computation can more often be merged into the index mode. The resulting
5086 // extra micro-ops can significantly decrease throughput.
5087 const unsigned NumVectorInstToHideOverhead = 10;
5088
5089 // Cost modeling of Strided Access Computation is hidden by the indexing
5090   // modes of X86 regardless of the stride value. We don't believe that there
5091   // is a difference between constant strided access in general and constant
5092   // strided access whose stride value is less than or equal to 64.
5093 // Even in the case of (loop invariant) stride whose value is not known at
5094 // compile time, the address computation will not incur more than one extra
5095 // ADD instruction.
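  // Illustrative note (annotation): under the checks below (vector type, SCEV
  // available, no AVX2), an address that is not a strided access at all costs
  // 10 (NumVectorInstToHideOverhead), a strided access with a loop-invariant
  // but unknown stride costs 1 extra ADD, and a constant stride falls through
  // to the base implementation since it folds into the addressing mode.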
5096 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5097 // TODO: AVX2 is the current cut-off because we don't have correct
5098 // interleaving costs for prior ISA's.
5099     if (!BaseT::isStridedAccess(Ptr))
5100       return NumVectorInstToHideOverhead;
5101     if (!BaseT::getConstantStrideStep(SE, Ptr))
5102       return 1;
5103 }
5104
5105 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5106}
5107
5108 InstructionCost
5109 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5110                                        std::optional<FastMathFlags> FMF,
5111                                        TTI::TargetCostKind CostKind) {
5112   if (TTI::requiresOrderedReduction(FMF))
5113     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5114
5115   // We use the Intel Architecture Code Analyzer (IACA) to measure the
5116   // throughput and use it as the cost.
5117
5118 static const CostTblEntry SLMCostTbl[] = {
5119 { ISD::FADD, MVT::v2f64, 3 },
5120 { ISD::ADD, MVT::v2i64, 5 },
5121 };
5122
5123 static const CostTblEntry SSE2CostTbl[] = {
5124 { ISD::FADD, MVT::v2f64, 2 },
5125 { ISD::FADD, MVT::v2f32, 2 },
5126 { ISD::FADD, MVT::v4f32, 4 },
5127 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5128 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5129 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5130 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5131 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5132 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5133 { ISD::ADD, MVT::v2i8, 2 },
5134 { ISD::ADD, MVT::v4i8, 2 },
5135 { ISD::ADD, MVT::v8i8, 2 },
5136 { ISD::ADD, MVT::v16i8, 3 },
5137 };
5138
5139 static const CostTblEntry AVX1CostTbl[] = {
5140 { ISD::FADD, MVT::v4f64, 3 },
5141 { ISD::FADD, MVT::v4f32, 3 },
5142 { ISD::FADD, MVT::v8f32, 4 },
5143 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5144 { ISD::ADD, MVT::v4i64, 3 },
5145 { ISD::ADD, MVT::v8i32, 5 },
5146 { ISD::ADD, MVT::v16i16, 5 },
5147 { ISD::ADD, MVT::v32i8, 4 },
5148 };
5149
5150 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5151 assert(ISD && "Invalid opcode");
5152
5153 // Before legalizing the type, give a chance to look up illegal narrow types
5154 // in the table.
5155 // FIXME: Is there a better way to do this?
5156 EVT VT = TLI->getValueType(DL, ValTy);
5157 if (VT.isSimple()) {
5158 MVT MTy = VT.getSimpleVT();
5159 if (ST->useSLMArithCosts())
5160 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5161 return Entry->Cost;
5162
5163 if (ST->hasAVX())
5164 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5165 return Entry->Cost;
5166
5167 if (ST->hasSSE2())
5168 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5169 return Entry->Cost;
5170 }
5171
5172 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5173
5174 MVT MTy = LT.second;
5175
5176 auto *ValVTy = cast<FixedVectorType>(ValTy);
5177
5178 // Special case: vXi8 mul reductions are performed as vXi16.
5179 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5180 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5181 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5182 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5183                             TargetTransformInfo::CastContextHint::None,
5184                             CostKind) +
5185 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5186 }
5187
5188 InstructionCost ArithmeticCost = 0;
5189 if (LT.first != 1 && MTy.isVector() &&
5190 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5191 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5192 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5193 MTy.getVectorNumElements());
5194 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5195 ArithmeticCost *= LT.first - 1;
5196 }
5197
5198 if (ST->useSLMArithCosts())
5199 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5200 return ArithmeticCost + Entry->Cost;
5201
5202 if (ST->hasAVX())
5203 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5204 return ArithmeticCost + Entry->Cost;
5205
5206 if (ST->hasSSE2())
5207 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5208 return ArithmeticCost + Entry->Cost;
5209
5210 // FIXME: These assume a naive kshift+binop lowering, which is probably
5211 // conservative in most cases.
5212 static const CostTblEntry AVX512BoolReduction[] = {
5213 { ISD::AND, MVT::v2i1, 3 },
5214 { ISD::AND, MVT::v4i1, 5 },
5215 { ISD::AND, MVT::v8i1, 7 },
5216 { ISD::AND, MVT::v16i1, 9 },
5217 { ISD::AND, MVT::v32i1, 11 },
5218 { ISD::AND, MVT::v64i1, 13 },
5219 { ISD::OR, MVT::v2i1, 3 },
5220 { ISD::OR, MVT::v4i1, 5 },
5221 { ISD::OR, MVT::v8i1, 7 },
5222 { ISD::OR, MVT::v16i1, 9 },
5223 { ISD::OR, MVT::v32i1, 11 },
5224 { ISD::OR, MVT::v64i1, 13 },
5225 };
5226
5227 static const CostTblEntry AVX2BoolReduction[] = {
5228 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5229 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5230 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5231 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5232 };
5233
5234 static const CostTblEntry AVX1BoolReduction[] = {
5235 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5236 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5237 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5238 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5239 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5240 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5241 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5242 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5243 };
5244
5245 static const CostTblEntry SSE2BoolReduction[] = {
5246 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5247 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5248 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5249 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5250 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5251 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5252 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5253 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5254 };
5255
5256 // Handle bool allof/anyof patterns.
5257 if (ValVTy->getElementType()->isIntegerTy(1)) {
5258 InstructionCost ArithmeticCost = 0;
5259 if (LT.first != 1 && MTy.isVector() &&
5260 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5261 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5262 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5263 MTy.getVectorNumElements());
5264 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5265 ArithmeticCost *= LT.first - 1;
5266 }
5267
5268 if (ST->hasAVX512())
5269 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5270 return ArithmeticCost + Entry->Cost;
5271 if (ST->hasAVX2())
5272 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5273 return ArithmeticCost + Entry->Cost;
5274 if (ST->hasAVX())
5275 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5276 return ArithmeticCost + Entry->Cost;
5277 if (ST->hasSSE2())
5278 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5279 return ArithmeticCost + Entry->Cost;
5280
5281 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5282 }
5283
5284 unsigned NumVecElts = ValVTy->getNumElements();
5285 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5286
5287 // Special case power of 2 reductions where the scalar type isn't changed
5288 // by type legalization.
5289 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5290 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5291
5292 InstructionCost ReductionCost = 0;
5293
5294 auto *Ty = ValVTy;
5295 if (LT.first != 1 && MTy.isVector() &&
5296 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5297 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5298 Ty = FixedVectorType::get(ValVTy->getElementType(),
5299 MTy.getVectorNumElements());
5300 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5301 ReductionCost *= LT.first - 1;
5302 NumVecElts = MTy.getVectorNumElements();
5303 }
5304
5305 // Now handle reduction with the legal type, taking into account size changes
5306 // at each level.
5307 while (NumVecElts > 1) {
5308 // Determine the size of the remaining vector we need to reduce.
5309 unsigned Size = NumVecElts * ScalarSize;
5310 NumVecElts /= 2;
5311 // If we're reducing from 256/512 bits, use an extract_subvector.
5312 if (Size > 128) {
5313 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5314 ReductionCost +=
5315           getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5316                          NumVecElts, SubTy);
5317 Ty = SubTy;
5318 } else if (Size == 128) {
5319 // Reducing from 128 bits is a permute of v2f64/v2i64.
5320 FixedVectorType *ShufTy;
5321 if (ValVTy->isFloatingPointTy())
5322 ShufTy =
5323 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5324 else
5325 ShufTy =
5326 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5327 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5328 std::nullopt, CostKind, 0, nullptr);
5329 } else if (Size == 64) {
5330 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5331 FixedVectorType *ShufTy;
5332 if (ValVTy->isFloatingPointTy())
5333 ShufTy =
5334 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5335 else
5336 ShufTy =
5337 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5338 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5339 std::nullopt, CostKind, 0, nullptr);
5340 } else {
5341 // Reducing from smaller size is a shift by immediate.
5342 auto *ShiftTy = FixedVectorType::get(
5343 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5344 ReductionCost += getArithmeticInstrCost(
5345 Instruction::LShr, ShiftTy, CostKind,
5346           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5347           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5348     }
5349
5350 // Add the arithmetic op for this level.
5351 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5352 }
5353
5354 // Add the final extract element to the cost.
5355 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5356 CostKind, 0, nullptr, nullptr);
5357}
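// Illustrative note (annotation): for a reduction not covered by the tables
// above, e.g. a v4i32 multiply reduction, the loop models it as two halving
// steps (a 128-bit permute, then a 64-bit shuffle, each followed by a v4i32
// multiply) plus a final extractelement.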
5358
5359 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5360                                           TTI::TargetCostKind CostKind,
5361                                           FastMathFlags FMF) {
5362 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5363 return getIntrinsicInstrCost(ICA, CostKind);
5364}
5365
5366 InstructionCost
5367 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5368                                    FastMathFlags FMF,
5369                                    TTI::TargetCostKind CostKind) {
5370   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5371
5372 MVT MTy = LT.second;
5373
5374 int ISD;
5375 if (ValTy->isIntOrIntVectorTy()) {
5376 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5377 : ISD::SMIN;
5378 } else {
5379 assert(ValTy->isFPOrFPVectorTy() &&
5380            "Expected floating point or integer vector type.");
5381 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5382 ? ISD::FMINNUM
5383 : ISD::FMINIMUM;
5384 }
5385
5386   // We use the Intel Architecture Code Analyzer (IACA) to measure the
5387   // throughput and use it as the cost.
5388
5389 static const CostTblEntry SSE2CostTbl[] = {
5390 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5391 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5392 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5393 };
5394
5395 static const CostTblEntry SSE41CostTbl[] = {
5396 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5397 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5398 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5399 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5400 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5401 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5402 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5403 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5404 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5405 {ISD::SMIN, MVT::v16i8, 6},
5406 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5407 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5408 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5409 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5410 };
5411
5412 static const CostTblEntry AVX1CostTbl[] = {
5413 {ISD::SMIN, MVT::v16i16, 6},
5414 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5415 {ISD::SMIN, MVT::v32i8, 8},
5416 {ISD::UMIN, MVT::v32i8, 8},
5417 };
5418
5419 static const CostTblEntry AVX512BWCostTbl[] = {
5420 {ISD::SMIN, MVT::v32i16, 8},
5421 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5422 {ISD::SMIN, MVT::v64i8, 10},
5423 {ISD::UMIN, MVT::v64i8, 10},
5424 };
5425
5426 // Before legalizing the type, give a chance to look up illegal narrow types
5427 // in the table.
5428 // FIXME: Is there a better way to do this?
5429 EVT VT = TLI->getValueType(DL, ValTy);
5430 if (VT.isSimple()) {
5431 MVT MTy = VT.getSimpleVT();
5432 if (ST->hasBWI())
5433 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5434 return Entry->Cost;
5435
5436 if (ST->hasAVX())
5437 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5438 return Entry->Cost;
5439
5440 if (ST->hasSSE41())
5441 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5442 return Entry->Cost;
5443
5444 if (ST->hasSSE2())
5445 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5446 return Entry->Cost;
5447 }
5448
5449 auto *ValVTy = cast<FixedVectorType>(ValTy);
5450 unsigned NumVecElts = ValVTy->getNumElements();
5451
5452 auto *Ty = ValVTy;
5453 InstructionCost MinMaxCost = 0;
5454 if (LT.first != 1 && MTy.isVector() &&
5455 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5456     // Type needs to be split. We need LT.first - 1 operations.
5457 Ty = FixedVectorType::get(ValVTy->getElementType(),
5458 MTy.getVectorNumElements());
5459 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5460 MinMaxCost *= LT.first - 1;
5461 NumVecElts = MTy.getVectorNumElements();
5462 }
5463
5464 if (ST->hasBWI())
5465 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5466 return MinMaxCost + Entry->Cost;
5467
5468 if (ST->hasAVX())
5469 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5470 return MinMaxCost + Entry->Cost;
5471
5472 if (ST->hasSSE41())
5473 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5474 return MinMaxCost + Entry->Cost;
5475
5476 if (ST->hasSSE2())
5477 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5478 return MinMaxCost + Entry->Cost;
5479
5480 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5481
5482 // Special case power of 2 reductions where the scalar type isn't changed
5483 // by type legalization.
5484 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5485 ScalarSize != MTy.getScalarSizeInBits())
5486 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5487
5488 // Now handle reduction with the legal type, taking into account size changes
5489 // at each level.
5490 while (NumVecElts > 1) {
5491 // Determine the size of the remaining vector we need to reduce.
5492 unsigned Size = NumVecElts * ScalarSize;
5493 NumVecElts /= 2;
5494 // If we're reducing from 256/512 bits, use an extract_subvector.
5495 if (Size > 128) {
5496 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5497 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5498 CostKind, NumVecElts, SubTy);
5499 Ty = SubTy;
5500 } else if (Size == 128) {
5501 // Reducing from 128 bits is a permute of v2f64/v2i64.
5502 VectorType *ShufTy;
5503 if (ValTy->isFloatingPointTy())
5504 ShufTy =
5505           FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5506     else
5507 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5508 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5509 std::nullopt, CostKind, 0, nullptr);
5510 } else if (Size == 64) {
5511 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5512 FixedVectorType *ShufTy;
5513 if (ValTy->isFloatingPointTy())
5514 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5515 else
5516 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5517 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5518 std::nullopt, CostKind, 0, nullptr);
5519 } else {
5520 // Reducing from smaller size is a shift by immediate.
5521 auto *ShiftTy = FixedVectorType::get(
5522 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5523 MinMaxCost += getArithmeticInstrCost(
5524 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5525           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5526           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5527     }
5528
5529 // Add the arithmetic op for this level.
5530 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5531 }
5532
5533 // Add the final extract element to the cost.
5534 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5535 CostKind, 0, nullptr, nullptr);
5536}
5537
5538/// Calculate the cost of materializing a 64-bit value. This helper
5539/// method might only calculate a fraction of a larger immediate. Therefore it
5540/// is valid to return a cost of ZERO.
5541 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5542   if (Val == 0)
5543 return TTI::TCC_Free;
5544
5545 if (isInt<32>(Val))
5546 return TTI::TCC_Basic;
5547
5548 return 2 * TTI::TCC_Basic;
5549}
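// Illustrative note (annotation): getIntImmCost(0) is TCC_Free,
// getIntImmCost(42) is TCC_Basic since the value fits in a signed 32-bit
// immediate, and getIntImmCost(0x100000000) is 2 * TCC_Basic because it needs
// a full 64-bit (movabs-style) materialization.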
5550
5551 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5552                                           TTI::TargetCostKind CostKind) {
5553   assert(Ty->isIntegerTy());
5554
5555 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5556 if (BitSize == 0)
5557 return ~0U;
5558
5559   // Never hoist constants larger than 128 bits, because this might lead to
5560   // incorrect code generation or assertions in codegen.
5561   // FIXME: Create a cost model for types larger than i128 once the codegen
5562 // issues have been fixed.
5563 if (BitSize > 128)
5564 return TTI::TCC_Free;
5565
5566 if (Imm == 0)
5567 return TTI::TCC_Free;
5568
5569 // Sign-extend all constants to a multiple of 64-bit.
5570 APInt ImmVal = Imm;
5571 if (BitSize % 64 != 0)
5572 ImmVal = Imm.sext(alignTo(BitSize, 64));
5573
5574 // Split the constant into 64-bit chunks and calculate the cost for each
5575 // chunk.
5576   InstructionCost Cost = 0;
5577   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5578 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5579 int64_t Val = Tmp.getSExtValue();
5580 Cost += getIntImmCost(Val);
5581 }
5582 // We need at least one instruction to materialize the constant.
5583 return std::max<InstructionCost>(1, Cost);
5584}
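// Illustrative note (annotation): an i128 immediate whose two 64-bit halves
// both fall outside the signed 32-bit range is costed as 2 + 2 = 4 by the
// chunked loop above, while an i64 value that sign-extends from 32 bits stays
// at cost 1.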
5585
5586 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5587                                               const APInt &Imm, Type *Ty,
5588                                               TTI::TargetCostKind CostKind,
5589                                               Instruction *Inst) {
5590 assert(Ty->isIntegerTy());
5591
5592 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5593 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5594 // here, so that constant hoisting will ignore this constant.
5595 if (BitSize == 0)
5596 return TTI::TCC_Free;
5597
5598 unsigned ImmIdx = ~0U;
5599 switch (Opcode) {
5600 default:
5601 return TTI::TCC_Free;
5602 case Instruction::GetElementPtr:
5603 // Always hoist the base address of a GetElementPtr. This prevents the
5604 // creation of new constants for every base constant that gets constant
5605 // folded with the offset.
5606 if (Idx == 0)
5607 return 2 * TTI::TCC_Basic;
5608 return TTI::TCC_Free;
5609 case Instruction::Store:
5610 ImmIdx = 0;
5611 break;
5612 case Instruction::ICmp:
5613 // This is an imperfect hack to prevent constant hoisting of
5614 // compares that might be trying to check if a 64-bit value fits in
5615 // 32-bits. The backend can optimize these cases using a right shift by 32.
5616     // Ideally we would check the compare predicate here. There are also other
5617     // similar immediates the backend can use shifts for.
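    // Illustrative note (annotation): e.g. `icmp ugt i64 %x, 0xffffffff`
    // keeps its immediate un-hoisted (TCC_Free) so the backend can lower the
    // range check with a shift by 32 instead of materializing the constant.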
5618 if (Idx == 1 && Imm.getBitWidth() == 64) {
5619 uint64_t ImmVal = Imm.getZExtValue();
5620 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5621 return TTI::TCC_Free;
5622 }
5623 ImmIdx = 1;
5624 break;
5625 case Instruction::And:
5626 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5627 // by using a 32-bit operation with implicit zero extension. Detect such
5628 // immediates here as the normal path expects bit 31 to be sign extended.
5629 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5630 return TTI::TCC_Free;
5631 ImmIdx = 1;
5632 break;
5633 case Instruction::Add:
5634 case Instruction::Sub:
5635 // For add/sub, we can use the opposite instruction for INT32_MIN.
5636 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5637 return TTI::TCC_Free;
5638 ImmIdx = 1;
5639 break;
5640 case Instruction::UDiv:
5641 case Instruction::SDiv:
5642 case Instruction::URem:
5643 case Instruction::SRem:
5644 // Division by constant is typically expanded later into a different
5645 // instruction sequence. This completely changes the constants.
5646 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5647 return TTI::TCC_Free;
5648 case Instruction::Mul:
5649 case Instruction::Or:
5650 case Instruction::Xor:
5651 ImmIdx = 1;
5652 break;
5653 // Always return TCC_Free for the shift value of a shift instruction.
5654 case Instruction::Shl:
5655 case Instruction::LShr:
5656 case Instruction::AShr:
5657 if (Idx == 1)
5658 return TTI::TCC_Free;
5659 break;
5660 case Instruction::Trunc:
5661 case Instruction::ZExt:
5662 case Instruction::SExt:
5663 case Instruction::IntToPtr:
5664 case Instruction::PtrToInt:
5665 case Instruction::BitCast:
5666 case Instruction::PHI:
5667 case Instruction::Call:
5668 case Instruction::Select:
5669 case Instruction::Ret:
5670 case Instruction::Load:
5671 break;
5672 }
5673
5674 if (Idx == ImmIdx) {
5675 uint64_t NumConstants = divideCeil(BitSize, 64);
5676     InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5677     return (Cost <= NumConstants * TTI::TCC_Basic)
5678 ? static_cast<int>(TTI::TCC_Free)
5679 : Cost;
5680 }
5681
5682 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5683}
5684
5685 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5686                                                 const APInt &Imm, Type *Ty,
5687                                                 TTI::TargetCostKind CostKind) {
5688   assert(Ty->isIntegerTy());
5689
5690 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5691 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5692 // here, so that constant hoisting will ignore this constant.
5693 if (BitSize == 0)
5694 return TTI::TCC_Free;
5695
5696 switch (IID) {
5697 default:
5698 return TTI::TCC_Free;
5699 case Intrinsic::sadd_with_overflow:
5700 case Intrinsic::uadd_with_overflow:
5701 case Intrinsic::ssub_with_overflow:
5702 case Intrinsic::usub_with_overflow:
5703 case Intrinsic::smul_with_overflow:
5704 case Intrinsic::umul_with_overflow:
5705 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5706 return TTI::TCC_Free;
5707 break;
5708 case Intrinsic::experimental_stackmap:
5709 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5710 return TTI::TCC_Free;
5711 break;
5712 case Intrinsic::experimental_patchpoint_void:
5713 case Intrinsic::experimental_patchpoint:
5714 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5715 return TTI::TCC_Free;
5716 break;
5717 }
5718 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5719}
5720
5721 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5722                                            TTI::TargetCostKind CostKind,
5723                                            const Instruction *I) {
5724   if (CostKind != TTI::TCK_RecipThroughput)
5725     return Opcode == Instruction::PHI ? 0 : 1;
5726 // Branches are assumed to be predicted.
5727 return 0;
5728}
5729
5730int X86TTIImpl::getGatherOverhead() const {
5731 // Some CPUs have more overhead for gather. The specified overhead is relative
5732 // to the Load operation. "2" is the number provided by Intel architects. This
5733 // parameter is used for cost estimation of Gather Op and comparison with
5734 // other alternatives.
5735   // TODO: Remove the explicit hasAVX512()? That would mean we would only
5736   // enable gather with a -march.
5737 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5738 return 2;
5739
5740 return 1024;
5741}
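// Note (annotation): this overhead feeds getGSVectorCost() below, where the
// total gather cost is roughly GSOverhead + VF * scalar-load cost, so
// returning 1024 effectively disables gather-based vectorization on
// slow-gather CPUs.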
5742
5743int X86TTIImpl::getScatterOverhead() const {
5744 if (ST->hasAVX512())
5745 return 2;
5746
5747 return 1024;
5748}
5749
5750// Return an average cost of Gather / Scatter instruction, maybe improved later.
5751// FIXME: Add TargetCostKind support.
5752InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
5753                                             TTI::TargetCostKind CostKind,
5754                                             Type *SrcVTy, const Value *Ptr,
5755 Align Alignment,
5756 unsigned AddressSpace) {
5757
5758 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5759 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5760
5761 // Try to reduce index size from 64 bit (default for GEP)
5762 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5763   // operation will use 16 x 64 indices which do not fit in a zmm and need
5764   // to be split. Also check that the base pointer is the same for all lanes,
5765 // and that there's at most one variable index.
5766 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5767 unsigned IndexSize = DL.getPointerSizeInBits();
5768 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5769 if (IndexSize < 64 || !GEP)
5770 return IndexSize;
5771
5772 unsigned NumOfVarIndices = 0;
5773 const Value *Ptrs = GEP->getPointerOperand();
5774 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5775 return IndexSize;
5776 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5777 if (isa<Constant>(GEP->getOperand(I)))
5778 continue;
5779 Type *IndxTy = GEP->getOperand(I)->getType();
5780 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5781 IndxTy = IndexVTy->getElementType();
5782 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5783 !isa<SExtInst>(GEP->getOperand(I))) ||
5784 ++NumOfVarIndices > 1)
5785 return IndexSize; // 64
5786 }
5787 return (unsigned)32;
5788 };
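  // Illustrative note (annotation): a GEP whose base pointer is uniform
  // across lanes and whose single variable vector index is e.g. a
  // sign-extended 32-bit value can use 32-bit indices; a second variable
  // index or a plain 64-bit index keeps the width at 64 and may force the
  // gather to be split.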
5789
5790 // Trying to reduce IndexSize to 32 bits for vector 16.
5791 // By default the IndexSize is equal to pointer size.
5792 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5793 ? getIndexSizeInBits(Ptr, DL)
5794                            : DL.getPointerSizeInBits();
5795 
5796 auto *IndexVTy = FixedVectorType::get(
5797 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5798 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5799 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5800 InstructionCost::CostType SplitFactor =
5801 *std::max(IdxsLT.first, SrcLT.first).getValue();
5802 if (SplitFactor > 1) {
5803 // Handle splitting of vector of pointers
5804 auto *SplitSrcTy =
5805 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5806 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
5807 Alignment, AddressSpace);
5808 }
5809
5810   // The gather / scatter cost is given by Intel architects. It is a rough
5811   // number since we are looking at one instruction at a time.
5812 const int GSOverhead = (Opcode == Instruction::Load)
5813 ? getGatherOverhead()
5814 : getScatterOverhead();
5815 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5816 MaybeAlign(Alignment), AddressSpace,
5817                                            CostKind);
5818 }
5819
5820/// Return the cost of full scalarization of gather / scatter operation.
5821///
5822/// Opcode - Load or Store instruction.
5823/// SrcVTy - The type of the data vector that should be gathered or scattered.
5824/// VariableMask - The mask is non-constant at compile time.
5825/// Alignment - Alignment for one element.
5826/// AddressSpace - pointer[s] address space.
5827/// TODO: Remove this and use getCommonMaskedMemoryOpCost directly.
5828InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode,
5829                                             TTI::TargetCostKind CostKind,
5830                                             Type *SrcVTy, bool VariableMask,
5831 Align Alignment,
5832 unsigned AddressSpace) {
5833 Type *ScalarTy = SrcVTy->getScalarType();
5834 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5835 APInt DemandedElts = APInt::getAllOnes(VF);
5836
5837 InstructionCost MaskUnpackCost = 0;
5838 if (VariableMask) {
5839 auto *MaskTy =
5840         FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5841     MaskUnpackCost = getScalarizationOverhead(
5842 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5843 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5844 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5845         CmpInst::BAD_ICMP_PREDICATE, CostKind);
5846     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5847 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5848 }
5849
5850 InstructionCost AddressUnpackCost = getScalarizationOverhead(
5851       FixedVectorType::get(PointerType::getUnqual(ScalarTy->getContext()), VF),
5852       DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5853
5854 // The cost of the scalar loads/stores.
5855 InstructionCost MemoryOpCost =
5856 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5857                            AddressSpace, CostKind);
5858 
5859 // The cost of forming the vector from loaded scalars/
5860 // scalarizing the vector to perform scalar stores.
5861 InstructionCost InsertExtractCost = getScalarizationOverhead(
5862 cast<FixedVectorType>(SrcVTy), DemandedElts,
5863 /*Insert=*/Opcode == Instruction::Load,
5864 /*Extract=*/Opcode == Instruction::Store, CostKind);
5865
5866 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5867}
5868
5869/// Calculate the cost of Gather / Scatter operation
5870 InstructionCost X86TTIImpl::getGatherScatterOpCost(
5871     unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5872     Align Alignment, TTI::TargetCostKind CostKind,
5873     const Instruction *I = nullptr) {
5874   if (CostKind != TTI::TCK_RecipThroughput) {
5875     if ((Opcode == Instruction::Load &&
5876 isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5877 !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5878 Align(Alignment))) ||
5879 (Opcode == Instruction::Store &&
5880 isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5881 !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5882 Align(Alignment))))
5883 return 1;
5884 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5885 Alignment, CostKind, I);
5886 }
5887
5888 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5889 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5890 if (!PtrTy && Ptr->getType()->isVectorTy())
5891 PtrTy = dyn_cast<PointerType>(
5892 cast<VectorType>(Ptr->getType())->getElementType());
5893 assert(PtrTy && "Unexpected type for Ptr argument");
5894 unsigned AddressSpace = PtrTy->getAddressSpace();
5895
5896 if ((Opcode == Instruction::Load &&
5897 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5898 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5899 Align(Alignment)))) ||
5900 (Opcode == Instruction::Store &&
5901 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5902 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5903 Align(Alignment)))))
5904 return getGSScalarCost(Opcode, CostKind, SrcVTy, VariableMask, Alignment,
5905 AddressSpace);
5906
5907 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
5908 AddressSpace);
5909}
5910
5911 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5912                                const TargetTransformInfo::LSRCost &C2) {
5913 // X86 specific here are "instruction number 1st priority".
5914 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5915 C1.NumIVMuls, C1.NumBaseAdds,
5916 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5917 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5918 C2.NumIVMuls, C2.NumBaseAdds,
5919 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5920}
5921
5922 bool X86TTIImpl::canMacroFuseCmp() {
5923   return ST->hasMacroFusion() || ST->hasBranchFusion();
5924}
5925
5926bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5927 if (!ST->hasAVX())
5928 return false;
5929
5930 // The backend can't handle a single element vector.
5931 if (isa<VectorType>(DataTy) &&
5932 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5933 return false;
5934 Type *ScalarTy = DataTy->getScalarType();
5935
5936 if (ScalarTy->isPointerTy())
5937 return true;
5938
5939 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5940 return true;
5941
5942 if (ScalarTy->isHalfTy() && ST->hasBWI())
5943 return true;
5944
5945 if (ScalarTy->isBFloatTy() && ST->hasBF16())
5946 return true;
5947
5948 if (!ScalarTy->isIntegerTy())
5949 return false;
5950
5951 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5952 return IntWidth == 32 || IntWidth == 64 ||
5953 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5954}
5955
5956bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5957 return isLegalMaskedLoad(DataType, Alignment);
5958}
5959
5960bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5961 unsigned DataSize = DL.getTypeStoreSize(DataType);
5962 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5963 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5964 // (the equivalent stores only require AVX).
5965 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5966 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5967
5968 return false;
5969}
5970
5971bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5972 unsigned DataSize = DL.getTypeStoreSize(DataType);
5973
5974 // SSE4A supports nontemporal stores of float and double at arbitrary
5975 // alignment.
5976 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5977 return true;
5978
5979 // Besides the SSE4A subtarget exception above, only aligned stores are
5980   // available nontemporally on any other subtarget. And only stores with a size
5981 // of 4..32 bytes (powers of 2, only) are permitted.
5982 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5983 !isPowerOf2_32(DataSize))
5984 return false;
5985
5986 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5987 // loads require AVX2).
5988 if (DataSize == 32)
5989 return ST->hasAVX();
5990 if (DataSize == 16)
5991 return ST->hasSSE1();
5992 return true;
5993}
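// Illustrative note (annotation): a 16-byte aligned <4 x float> nontemporal
// store only needs SSE1, a 32-byte aligned <8 x float> store needs AVX, and
// SSE4A's scalar float/double nontemporal stores are the lone unaligned
// exception.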
5994
5995 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5996                                       ElementCount NumElements) const {
5997 // movddup
5998 return ST->hasSSE3() && !NumElements.isScalable() &&
5999 NumElements.getFixedValue() == 2 &&
6000 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6001}
6002
6003 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6004   if (!isa<VectorType>(DataTy))
6005 return false;
6006
6007 if (!ST->hasAVX512())
6008 return false;
6009
6010 // The backend can't handle a single element vector.
6011 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6012 return false;
6013
6014 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6015
6016 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6017 return true;
6018
6019 if (!ScalarTy->isIntegerTy())
6020 return false;
6021
6022 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6023 return IntWidth == 32 || IntWidth == 64 ||
6024 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6025}
6026
6027 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6028   return isLegalMaskedExpandLoad(DataTy, Alignment);
6029}
6030
6031bool X86TTIImpl::supportsGather() const {
6032 // Some CPUs have better gather performance than others.
6033   // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6034 // enable gather with a -march.
6035 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6036}
6037
6038 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6039   // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6040 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6041 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6042 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6043 // Check, maybe the gather/scatter instruction is better in the VariableMask
6044 // case.
6045 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6046 return NumElts == 1 ||
6047 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6048}
6049
6050 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6051   Type *ScalarTy = DataTy->getScalarType();
6052 if (ScalarTy->isPointerTy())
6053 return true;
6054
6055 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6056 return true;
6057
6058 if (!ScalarTy->isIntegerTy())
6059 return false;
6060
6061 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6062 return IntWidth == 32 || IntWidth == 64;
6063}
6064
6065 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6066   if (!supportsGather() || !ST->preferGather())
6067 return false;
6068 return isLegalMaskedGatherScatter(DataTy, Alignment);
6069}
6070
6071bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6072 unsigned Opcode1,
6073 const SmallBitVector &OpcodeMask) const {
6074 // ADDSUBPS 4xf32 SSE3
6075 // VADDSUBPS 4xf32 AVX
6076 // VADDSUBPS 8xf32 AVX2
6077 // ADDSUBPD 2xf64 SSE3
6078 // VADDSUBPD 2xf64 AVX
6079 // VADDSUBPD 4xf64 AVX2
6080
6081 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6082 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6083 if (!isPowerOf2_32(NumElements))
6084 return false;
6085 // Check the opcode pattern. We apply the mask on the opcode arguments and
6086 // then check if it is what we expect.
6087 for (int Lane : seq<int>(0, NumElements)) {
6088 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6089 // We expect FSub for even lanes and FAdd for odd lanes.
6090 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6091 return false;
6092 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6093 return false;
6094 }
6095 // Now check that the pattern is supported by the target ISA.
6096 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6097 if (ElemTy->isFloatTy())
6098 return ST->hasSSE3() && NumElements % 4 == 0;
6099 if (ElemTy->isDoubleTy())
6100 return ST->hasSSE3() && NumElements % 2 == 0;
6101 return false;
6102}
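// Illustrative note (annotation): with Opcode0 = FSub, Opcode1 = FAdd and an
// OpcodeMask whose odd lanes are set, a <4 x float> alternating pattern maps
// to the (V)ADDSUBPS form and is reported legal from SSE3 onwards.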
6103
6104bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6105 // AVX2 doesn't support scatter
6106 if (!ST->hasAVX512() || !ST->preferScatter())
6107 return false;
6108 return isLegalMaskedGatherScatter(DataType, Alignment);
6109}
6110
6111bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6112 EVT VT = TLI->getValueType(DL, DataType);
6113 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6114}
6115
6116 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6117   // FDIV is always expensive, even if it has a very low uop count.
6118 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6119 if (I->getOpcode() == Instruction::FDiv)
6120 return true;
6121
6122   return BaseT::isExpensiveToSpeculativelyExecute(I);
6123 }
6124
6125 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6126   return false;
6127}
6128
6129 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6130                                      const Function *Callee) const {
6131 const TargetMachine &TM = getTLI()->getTargetMachine();
6132
6133 // Work this as a subsetting of subtarget features.
6134 const FeatureBitset &CallerBits =
6135 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6136 const FeatureBitset &CalleeBits =
6137 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6138
6139 // Check whether features are the same (apart from the ignore list).
6140 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6141 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6142 if (RealCallerBits == RealCalleeBits)
6143 return true;
6144
6145 // If the features are a subset, we need to additionally check for calls
6146 // that may become ABI-incompatible as a result of inlining.
6147 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6148 return false;
6149
6150 for (const Instruction &I : instructions(Callee)) {
6151 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6152 // Having more target features is fine for inline ASM.
6153 if (CB->isInlineAsm())
6154 continue;
6155
6156       SmallVector<Type *, 8> Types;
6157       for (Value *Arg : CB->args())
6158 Types.push_back(Arg->getType());
6159 if (!CB->getType()->isVoidTy())
6160 Types.push_back(CB->getType());
6161
6162 // Simple types are always ABI compatible.
6163 auto IsSimpleTy = [](Type *Ty) {
6164 return !Ty->isVectorTy() && !Ty->isAggregateType();
6165 };
6166 if (all_of(Types, IsSimpleTy))
6167 continue;
6168
6169 if (Function *NestedCallee = CB->getCalledFunction()) {
6170 // Assume that intrinsics are always ABI compatible.
6171 if (NestedCallee->isIntrinsic())
6172 continue;
6173
6174 // Do a precise compatibility check.
6175 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6176 return false;
6177 } else {
6178 // We don't know the target features of the callee,
6179 // assume it is incompatible.
6180 return false;
6181 }
6182 }
6183 }
6184 return true;
6185}
6186
6187 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6188                                        const Function *Callee,
6189 const ArrayRef<Type *> &Types) const {
6190 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6191 return false;
6192
6193 // If we get here, we know the target features match. If one function
6194 // considers 512-bit vectors legal and the other does not, consider them
6195 // incompatible.
6196 const TargetMachine &TM = getTLI()->getTargetMachine();
6197
6198 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6199 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6200 return true;
6201
6202 // Consider the arguments compatible if they aren't vectors or aggregates.
6203 // FIXME: Look at the size of vectors.
6204 // FIXME: Look at the element types of aggregates to see if there are vectors.
6205 return llvm::none_of(Types,
6206 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6207}
6208
6209 X86TTIImpl::TTI::MemCmpExpansionOptions
6210 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6211   TTI::MemCmpExpansionOptions Options;
6212   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6213 Options.NumLoadsPerBlock = 2;
6214 // All GPR and vector loads can be unaligned.
6215 Options.AllowOverlappingLoads = true;
6216 if (IsZeroCmp) {
6217 // Only enable vector loads for equality comparison. Right now the vector
6218 // version is not as fast for three way compare (see #33329).
6219 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6220 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6221 Options.LoadSizes.push_back(64);
6222 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6223 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6224 }
6225 if (ST->is64Bit()) {
6226 Options.LoadSizes.push_back(8);
6227 }
6228 Options.LoadSizes.push_back(4);
6229 Options.LoadSizes.push_back(2);
6230 Options.LoadSizes.push_back(1);
6231 return Options;
6232}
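// Illustrative note (annotation): on a 64-bit AVX2 target with a 256-bit
// preferred vector width, an equality memcmp expansion gets
// LoadSizes = {32, 16, 8, 4, 2, 1} with up to two loads per block; the
// 64-byte entry additionally requires AVX-512 with EVEX512 and a 512-bit
// width preference.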
6233
6234 bool X86TTIImpl::prefersVectorizedAddressing() const {
6235   return supportsGather();
6236}
6237
6238 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6239   return false;
6240}
6241
6242 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6243   // TODO: We expect this to be beneficial regardless of arch,
6244 // but there are currently some unexplained performance artifacts on Atom.
6245 // As a temporary solution, disable on Atom.
6246 return !(ST->isAtom());
6247}
6248
6249// Get estimation for interleaved load/store operations and strided load.
6250// \p Indices contains indices for strided load.
6251// \p Factor - the factor of interleaving.
6252 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
6253 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6254     unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6255 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6256 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6257 // VecTy for interleave memop is <VF*Factor x Elt>.
6258 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6259 // VecTy = <12 x i32>.
6260
6261 // Calculate the number of memory operations (NumOfMemOps), required
6262 // for load/store the VecTy.
6263 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6264 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6265 unsigned LegalVTSize = LegalVT.getStoreSize();
6266 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
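  // For example, if VecTy is <12 x i32> (48 bytes) and it legalizes to v16i32
  // (64 bytes) on AVX-512, NumOfMemOps = ceil(48 / 64) = 1; with a 16-byte
  // legal type it would instead be 3.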
6267
6268 // Get the cost of one memory operation.
6269 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6270 LegalVT.getVectorNumElements());
6271 InstructionCost MemOpCost;
6272 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6273 if (UseMaskedMemOp)
6274 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6275                                       AddressSpace, CostKind);
6276   else
6277 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6278                                 AddressSpace, CostKind);
6279 
6280 unsigned VF = VecTy->getNumElements() / Factor;
6281 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6282
6283 InstructionCost MaskCost;
6284 if (UseMaskedMemOp) {
6285 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6286 for (unsigned Index : Indices) {
6287 assert(Index < Factor && "Invalid index for interleaved memory op");
6288 for (unsigned Elm = 0; Elm < VF; Elm++)
6289 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6290 }
6291
6292 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6293
6294 MaskCost = getReplicationShuffleCost(
6295 I1Type, Factor, VF,
6296 UseMaskForGaps ? DemandedLoadStoreElts
6297                        : APInt::getAllOnes(VecTy->getNumElements()),
6298         CostKind);
6299
6300 // The Gaps mask is invariant and created outside the loop, therefore the
6301 // cost of creating it is not accounted for here. However if we have both
6302 // a MaskForGaps and some other mask that guards the execution of the
6303 // memory access, we need to account for the cost of And-ing the two masks
6304 // inside the loop.
6305 if (UseMaskForGaps) {
6306 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6307 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6308 }
6309 }
6310
6311 if (Opcode == Instruction::Load) {
6312 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6313 // contain the cost of the optimized shuffle sequence that the
6314 // X86InterleavedAccess pass will generate.
6315 // The cost of loads and stores are computed separately from the table.
6316
6317 // X86InterleavedAccess support only the following interleaved-access group.
6318 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6319 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6320 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6321 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6322 };
6323
6324 if (const auto *Entry =
6325 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6326 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6327     // If an entry does not exist, fall back to the default implementation.
6328
6329 // Kind of shuffle depends on number of loaded values.
6330 // If we load the entire data in one register, we can use a 1-src shuffle.
6331 // Otherwise, we'll merge 2 sources in each operation.
6332 TTI::ShuffleKind ShuffleKind =
6333 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6334
6335 InstructionCost ShuffleCost = getShuffleCost(
6336 ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6337
6338 unsigned NumOfLoadsInInterleaveGrp =
6339 Indices.size() ? Indices.size() : Factor;
6340 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6341 VecTy->getNumElements() / Factor);
6342 InstructionCost NumOfResults =
6343 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6344
6345     // About half of the loads may be folded in shuffles when we have only
6346 // one result. If we have more than one result, or the loads are masked,
6347 // we do not fold loads at all.
6348 unsigned NumOfUnfoldedLoads =
6349 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6350
6351 // Get a number of shuffle operations per result.
6352 unsigned NumOfShufflesPerResult =
6353 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6354
6355     // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6356 // When we have more than one destination, we need additional instructions
6357 // to keep sources.
6358 InstructionCost NumOfMoves = 0;
6359 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6360 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6361
6362 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6363 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6364 NumOfMoves;
6365
6366 return Cost;
6367 }
6368
6369 // Store.
6370 assert(Opcode == Instruction::Store &&
6371 "Expected Store Instruction at this point");
6372 // X86InterleavedAccess support only the following interleaved-access group.
6373 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6374 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6375 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6376 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6377
6378 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6379 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6380 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6381 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6382 };
6383
6384 if (const auto *Entry =
6385 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6386 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6387   // If an entry does not exist, fall back to the default implementation.
6388
6389   // There are no strided stores at the moment, and a store can't be folded
6390   // into a shuffle.
6391 unsigned NumOfSources = Factor; // The number of values to be merged.
6392 InstructionCost ShuffleCost = getShuffleCost(
6393 TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6394 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6395
6396   // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6397 // We need additional instructions to keep sources.
6398 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6399   InstructionCost Cost =
6400       MaskCost +
6401 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6402 NumOfMoves;
6403 return Cost;
6404}
6405
6406 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6407     unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6408 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6409 bool UseMaskForCond, bool UseMaskForGaps) {
6410 auto *VecTy = cast<FixedVectorType>(BaseTy);
6411
6412 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6413 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6414 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6415 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6416 return true;
6417 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6418 return ST->hasBWI();
6419 if (EltTy->isBFloatTy())
6420 return ST->hasBF16();
6421 return false;
6422 };
6423 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6424     return getInterleavedMemoryOpCostAVX512(
6425         Opcode, VecTy, Factor, Indices, Alignment,
6426 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6427
6428 if (UseMaskForCond || UseMaskForGaps)
6429 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6430 Alignment, AddressSpace, CostKind,
6431 UseMaskForCond, UseMaskForGaps);
6432
6433 // Get estimation for interleaved load/store operations for SSE-AVX2.
6434 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6435 // computing the cost using a generic formula as a function of generic
6436 // shuffles. We therefore use a lookup table instead, filled according to
6437 // the instruction sequences that codegen currently generates.
6438
6439 // VecTy for interleave memop is <VF*Factor x Elt>.
6440 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6441 // VecTy = <12 x i32>.
6442 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6443
6444 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6445 // the VF=2, while v2i128 is an unsupported MVT vector type
6446 // (see MachineValueType.h::getVectorVT()).
6447 if (!LegalVT.isVector())
6448 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6449 Alignment, AddressSpace, CostKind);
6450
6451 unsigned VF = VecTy->getNumElements() / Factor;
6452 Type *ScalarTy = VecTy->getElementType();
6453 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6454 if (!ScalarTy->isIntegerTy())
6455 ScalarTy =
6456 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6457
6458 // Get the cost of all the memory operations.
6459 // FIXME: discount dead loads.
6460 InstructionCost MemOpCosts = getMemoryOpCost(
6461 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6462
6463 auto *VT = FixedVectorType::get(ScalarTy, VF);
6464 EVT ETy = TLI->getValueType(DL, VT);
6465 if (!ETy.isSimple())
6466 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6467 Alignment, AddressSpace, CostKind);
6468
6469 // TODO: Complete for other data-types and strides.
6470 // Each combination of Stride, element bit width and VF results in a different
6471   // sequence; the cost tables are therefore accessed with:
6472 // Factor (stride) and VectorType=VFxiN.
6473 // The Cost accounts only for the shuffle sequence;
6474 // The cost of the loads/stores is accounted for separately.
6475 //
6476 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6477 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6478 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6479 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6480 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6481 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6482
6483 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6484 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6485 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6486
6487 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6488 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6489 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6490
6491 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6492 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6493 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6494 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6495
6496 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6497 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6498 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6499 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6500 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6501
6502 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6503 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6504 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6505 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6506 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6507
6508 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6509 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6510 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6511 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6512 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6513
6514 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6515 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6516 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6517 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6518
6519 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6520 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6521 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6522 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6523 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6524
6525 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6526 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6527 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6528 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6529 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6530
6531 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6532 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6533 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6534 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6535 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6536
6537 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6538 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6539 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6540 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6541
6542 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6543 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6544 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6545 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6546 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6547
6548 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6549 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6550 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6551 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6552 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6553
6554 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6555 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6556 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6557 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6558
6559 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6560 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6561 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6562
6563 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6564 };
6565
6566 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6567 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6568 };
6569
6570 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6571 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6572 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6573
6574 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6575 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6576
6577 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6578 };
6579
6580 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6581 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6582 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6583
6584 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6585 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6586 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6587
6588 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6589 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6590 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6591 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6592
6593 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6594 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6595 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6596 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6597 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6598
6599 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6600 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6601 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6602 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6603 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6604
6605 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6606 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6607 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6608 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6609 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6610
6611 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6612 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6613 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6614 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6615 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6616
6617 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6618 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6619 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6620 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6621
6622 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6623 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6624 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6625 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6626 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6627
6628 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6629 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6630 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6631 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6632 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6633
6634 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6635 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6636 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6637 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6638 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6639
6640 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6641 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6642 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6643 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6644
6645 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6646 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6647 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6648 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6649 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6650
6651 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6652 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6653 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6654 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6655 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6656
6657 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6658 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6659 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6660 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6661
6662 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6663 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6664 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6665 };
6666
6667 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6668 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6669 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6670 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6671
6672 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6673 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6674
6675 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6676 };
6677
6678 if (Opcode == Instruction::Load) {
6679 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6680 MemOpCosts](const CostTblEntry *Entry) {
6681 // NOTE: this is just an approximation!
6682 // It can over- or under-estimate the cost!
6683 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6684 };
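  // For example, if only two members of a stride-4 group of v16i8 are
  // requested (Indices.size() == 2), the {4, MVT::v16i8, 24} entry in
  // AVX2InterleavedLoadTbl above is prorated to
  //   MemOpCosts + divideCeil(2 * 24, 4) == MemOpCosts + 12
  // rather than charged the full 24 shuffle units.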
6685
6686 if (ST->hasAVX2())
6687 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6688 ETy.getSimpleVT()))
6689 return GetDiscountedCost(Entry);
6690
6691 if (ST->hasSSSE3())
6692 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6693 ETy.getSimpleVT()))
6694 return GetDiscountedCost(Entry);
6695
6696 if (ST->hasSSE2())
6697 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6698 ETy.getSimpleVT()))
6699 return GetDiscountedCost(Entry);
6700 } else {
6701 assert(Opcode == Instruction::Store &&
6702 "Expected Store Instruction at this point");
6703 assert((!Indices.size() || Indices.size() == Factor) &&
6704 "Interleaved store only supports fully-interleaved groups.");
6705 if (ST->hasAVX2())
6706 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6707 ETy.getSimpleVT()))
6708 return MemOpCosts + Entry->Cost;
6709
6710 if (ST->hasSSE2())
6711 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6712 ETy.getSimpleVT()))
6713 return MemOpCosts + Entry->Cost;
6714 }
6715
6716 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6717 Alignment, AddressSpace, CostKind,
6718 UseMaskForCond, UseMaskForGaps);
6719}
6720
6721InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6722 int64_t BaseOffset,
6723 bool HasBaseReg, int64_t Scale,
6724 unsigned AddrSpace) const {
6725 // Scaling factors are not free at all.
6726 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6727 // will take 2 allocations in the out of order engine instead of 1
6728 // for plain addressing mode, i.e. inst (reg1).
6729 // E.g.,
6730 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6731 // Requires two allocations (one for the load, one for the computation)
6732 // whereas:
6733 // vaddps (%rsi), %ymm0, %ymm1
6734 // Requires just 1 allocation, i.e., freeing allocations for other operations
6735 // and having fewer micro-operations to execute.
6736 //
6737 // For some X86 architectures, this is even worse because for instance for
6738 // stores, the complex addressing mode forces the instruction to use the
6739 // "load" ports instead of the dedicated "store" port.
6740 // E.g., on Haswell:
6741 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6742 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6743 TargetLoweringBase::AddrMode AM;
6744 AM.BaseGV = BaseGV;
6745 AM.BaseOffs = BaseOffset;
6746 AM.HasBaseReg = HasBaseReg;
6747 AM.Scale = Scale;
6748 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6749 // Scale represents reg2 * scale, thus account for 1
6750 // as soon as we use a second register.
6751 return AM.Scale != 0;
6752 return -1;
6753}
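For orientation, the interleaving costs modelled above are normally reached from client passes through the generic TargetTransformInfo interface rather than by calling X86TTIImpl directly. The following is a minimal sketch, assuming the usual TTI/analysis headers are available, `TTI` is a TargetTransformInfo for an AVX2 subtarget, and `Ctx` is the current LLVMContext; the concrete types and arguments are illustrative only.

// Sketch only: query the cost of a stride-3 interleaved load of <24 x i32>,
// i.e. VF = 8 with all three members used.
auto *WideTy = FixedVectorType::get(Type::getInt32Ty(Ctx), /*NumElts=*/24);
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
    Instruction::Load, WideTy, /*Factor=*/3, /*Indices=*/{0, 1, 2},
    Align(4), /*AddressSpace=*/0, TargetTransformInfo::TCK_RecipThroughput);
// On an AVX2 subtarget this resolves to the {3, MVT::v8i32, 7} table entry
// plus the ordinary memory-op cost of the <24 x i32> load.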