LLVM 19.0.0git
X86TargetTransformInfo.cpp
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// A note about the cost model numbers used below: they correspond to a
16/// "generic" X86 CPU rather than to any one specific CPU model. Usually the
17/// numbers correspond to the CPU where the feature first appeared. For
18/// example, if we check Subtarget.hasSSE42() in the lookups below, the cost
19/// is based on Nehalem, as that was the first CPU to support that feature
20/// level and thus most likely has the worst case cost, although we may
21/// discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of target-dependent instruction costs (latency):
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported
46/// by the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values,
48/// which are often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
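// To make the four TargetCostKind columns concrete: each CostKindTblEntry in
// the tables below stores one value per cost kind, in the order
// { RecipThroughput, Latency, CodeSize, SizeAndLatency }. For example, the
// Goldmont table later in this file has
//   { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
// i.e. a scalar divss is costed with reciprocal throughput 18, latency 19,
// 1 instruction for code size and 1 micro-op for size-and-latency queries.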
50
51#include "X86TargetTransformInfo.h"
52#include "llvm/Analysis/TargetTransformInfo.h"
53#include "llvm/CodeGen/BasicTTIImpl.h"
54#include "llvm/CodeGen/CostTable.h"
55#include "llvm/CodeGen/TargetLowering.h"
56#include "llvm/IR/InstIterator.h"
57#include "llvm/IR/IntrinsicInst.h"
58#include "llvm/Support/Debug.h"
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71// Helper struct to store/access costs for each cost kind.
72// TODO: Move this to allow other targets to use it?
73struct CostKindCosts {
74  unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
79 std::optional<unsigned>
80  operator[](TargetTransformInfo::TargetCostKind Kind) const {
81    unsigned Cost = ~0U;
82    switch (Kind) {
83    case TargetTransformInfo::TCK_RecipThroughput:
84      Cost = RecipThroughputCost;
85      break;
86    case TargetTransformInfo::TCK_Latency:
87      Cost = LatencyCost;
88      break;
89    case TargetTransformInfo::TCK_CodeSize:
90      Cost = CodeSizeCost;
91      break;
92    case TargetTransformInfo::TCK_SizeAndLatency:
93      Cost = SizeAndLatencyCost;
94      break;
95    }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
102using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
103TargetTransformInfo::PopcntSupportKind
104X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
105  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106 // TODO: Currently the __builtin_popcount() implementation using SSE3
107 // instructions is inefficient. Once the problem is fixed, we should
108 // call ST->hasSSE3() instead of ST->hasPOPCNT().
109 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110}
111
112std::optional<unsigned> X86TTIImpl::getCacheSize(
113    TargetTransformInfo::CacheLevel Level) const {
114  switch (Level) {
115  case TargetTransformInfo::CacheLevel::L1D:
116  //   - Penryn
117 // - Nehalem
118 // - Westmere
119 // - Sandy Bridge
120 // - Ivy Bridge
121 // - Haswell
122 // - Broadwell
123 // - Skylake
124 // - Kabylake
125 return 32 * 1024; // 32 KByte
126  case TargetTransformInfo::CacheLevel::L2D:
127  //   - Penryn
128 // - Nehalem
129 // - Westmere
130 // - Sandy Bridge
131 // - Ivy Bridge
132 // - Haswell
133 // - Broadwell
134 // - Skylake
135 // - Kabylake
136 return 256 * 1024; // 256 KByte
137 }
138
139 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140}
141
142std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
143    TargetTransformInfo::CacheLevel Level) const {
144  //   - Penryn
145 // - Nehalem
146 // - Westmere
147 // - Sandy Bridge
148 // - Ivy Bridge
149 // - Haswell
150 // - Broadwell
151 // - Skylake
152 // - Kabylake
153 switch (Level) {
154  case TargetTransformInfo::CacheLevel::L1D:
155    [[fallthrough]];
156  case TargetTransformInfo::CacheLevel::L2D:
157    return 8;
158 }
159
160 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161}
162
163unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164 bool Vector = (ClassID == 1);
165 if (Vector && !ST->hasSSE1())
166 return 0;
167
168 if (ST->is64Bit()) {
169 if (Vector && ST->hasAVX512())
170 return 32;
171 return 16;
172 }
173 return 8;
174}
175
176TypeSize
177X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
178  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179  switch (K) {
180  case TargetTransformInfo::RGK_Scalar:
181    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
182  case TargetTransformInfo::RGK_FixedWidthVector:
183    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
184 return TypeSize::getFixed(512);
185 if (ST->hasAVX() && PreferVectorWidth >= 256)
186 return TypeSize::getFixed(256);
187 if (ST->hasSSE1() && PreferVectorWidth >= 128)
188 return TypeSize::getFixed(128);
189 return TypeSize::getFixed(0);
190  case TargetTransformInfo::RGK_ScalableVector:
191    return TypeSize::getScalable(0);
192 }
193
194 llvm_unreachable("Unsupported register kind");
195}
196
197unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
198  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
199      .getFixedValue();
200}
201
202unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
203  // If the loop will not be vectorized, don't interleave the loop.
204  // Let the regular unroller handle it instead, which saves the overflow
205  // check and memory check cost.
206 if (VF.isScalar())
207 return 1;
208
209 if (ST->isAtom())
210 return 1;
211
212 // Sandybridge and Haswell have multiple execution ports and pipelined
213 // vector units.
214 if (ST->hasAVX())
215 return 4;
216
217 return 2;
218}
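// For example, a loop that the vectorizer will widen on an AVX-capable target
// gets a maximum interleave factor of 4 here, i.e. up to four independent
// vector operation chains may be kept in flight per iteration; the final
// interleave count is still decided by the LoopVectorize heuristics.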
219
220InstructionCost X86TTIImpl::getArithmeticInstrCost(
221    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
222    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
223    ArrayRef<const Value *> Args,
224    const Instruction *CxtI) {
225
226 // vXi8 multiplications are always promoted to vXi16.
227 // Sub-128-bit types can be extended/packed more efficiently.
228 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
229 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
230 Type *WideVecTy =
231 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
232    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
233                            TTI::CastContextHint::None,
234                            CostKind) +
235           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
236                            TTI::CastContextHint::None,
237                            CostKind) +
238           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
239 }
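// As an illustration of the promotion above: the cost of mul <8 x i8> is
// computed as cost(zext <8 x i8> to <8 x i16>) + cost(trunc <8 x i16> to
// <8 x i8>) + cost(mul <8 x i16>), matching how such narrow multiplies are
// actually lowered (widen, multiply as i16, pack back).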
240
241 // Legalize the type.
242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
243
244 int ISD = TLI->InstructionOpcodeToISD(Opcode);
245 assert(ISD && "Invalid opcode");
246
247 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
248 (LT.second.getScalarType() == MVT::i32 ||
249 LT.second.getScalarType() == MVT::i64)) {
250 // Check if the operands can be represented as a smaller datatype.
251 bool Op1Signed = false, Op2Signed = false;
252 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
253 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
254 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
255 bool SignedMode = Op1Signed || Op2Signed;
256
257 // If both vXi32 are representable as i15 and at least one is constant,
258 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
259 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
260 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
261 LT.second.getScalarType() == MVT::i32) {
262 bool Op1Constant =
263 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
264 bool Op2Constant =
265 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
266 bool Op1Sext = isa<SExtInst>(Args[0]) &&
267 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
268 bool Op2Sext = isa<SExtInst>(Args[1]) &&
269 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
270
271 bool IsZeroExtended = !Op1Signed || !Op2Signed;
272 bool IsConstant = Op1Constant || Op2Constant;
273 bool IsSext = Op1Sext || Op2Sext;
274 if (IsConstant || IsZeroExtended || IsSext)
275 LT.second =
276 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
277 }
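// Illustrative case for the PMADDWD path above: mul <4 x i32> %a, %b where
// %a is zero-extended from <4 x i8> and %b is a constant splat of 100 keeps
// OpMinSize <= 15, so the legalized type is rewritten to v8i16 and the
// multiply is costed like a vXi16 multiply (codegen can use PMADDWD for it).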
278
279 // Check if the vXi32 operands can be shrunk into a smaller datatype.
280 // This should match the codegen from reduceVMULWidth.
281 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
282 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
283 if (OpMinSize <= 7)
284 return LT.first * 3; // pmullw/sext
285 if (!SignedMode && OpMinSize <= 8)
286 return LT.first * 3; // pmullw/zext
287 if (OpMinSize <= 15)
288 return LT.first * 5; // pmullw/pmulhw/pshuf
289 if (!SignedMode && OpMinSize <= 16)
290 return LT.first * 5; // pmullw/pmulhw/pshuf
291 }
292
293 // If both vXi64 are representable as (unsigned) i32, then we can perform
294 // the multiply with a single PMULUDQ instruction.
295 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
296 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
297 ISD = X86ISD::PMULUDQ;
298 }
299
300 // Vector multiply by pow2 will be simplified to shifts.
301 // Vector multiply by -pow2 will be simplified to shifts/negates.
302 if (ISD == ISD::MUL && Op2Info.isConstant() &&
303 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
304    InstructionCost Cost =
305        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
306 Op1Info.getNoProps(), Op2Info.getNoProps());
307 if (Op2Info.isNegatedPowerOf2())
308 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
309 return Cost;
310 }
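// e.g. mul <4 x i32> %x, <splat 8> is costed as a single vector shift
// (x << 3), and mul by a splat of -8 as the shift plus a subtract
// (0 - (x << 3)), matching the simplifications mentioned above.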
311
312 // On X86, vector signed division by a power-of-two constant is
313 // normally expanded to the sequence SRA + SRL + ADD + SRA.
314 // The OperandValue properties may not be the same as that of the previous
315 // operation; conservatively assume OP_None.
316 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
317 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
318    InstructionCost Cost =
319        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
320 Op1Info.getNoProps(), Op2Info.getNoProps());
321 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
322 Op1Info.getNoProps(), Op2Info.getNoProps());
323 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
324 Op1Info.getNoProps(), Op2Info.getNoProps());
325
326 if (ISD == ISD::SREM) {
327 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
328 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
329 Op2Info.getNoProps());
330 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
331 Op2Info.getNoProps());
332 }
333
334 return Cost;
335 }
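// e.g. sdiv <4 x i32> %x, <splat 4> is costed as SRA + SRL + ADD + SRA
// (two arithmetic shifts, one logical shift, one add), and srem by the same
// constant additionally pays for the multiply and subtract that rebuild
// X - (X/C)*C.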
336
337 // Vector unsigned division/remainder will be simplified to shifts/masks.
338 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
339 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
340 if (ISD == ISD::UDIV)
341 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
342 Op1Info.getNoProps(), Op2Info.getNoProps());
343 // UREM
344 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
345 Op1Info.getNoProps(), Op2Info.getNoProps());
346 }
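// e.g. udiv <4 x i32> %x, <splat 16> is costed as a single logical shift
// (x >> 4), and urem <4 x i32> %x, <splat 16> as a single mask (x & 15).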
347
348 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
350 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
351 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
352 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
353 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
354 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
355 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
356 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
357 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
358
359 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
360 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
361 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
362 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
363 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
364 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
365 };
366
367 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
368 if (const auto *Entry =
369 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
370 if (auto KindCost = Entry->Cost[CostKind])
371 return LT.first * *KindCost;
372
373 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
374 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
375 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
376 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
377
378 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
379 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
380 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
381
382 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
383 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
384 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
385 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
386 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
387 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
388
389 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
390 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
391 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
392 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
393 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
394 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
395 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
396
397 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
398 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
399 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
400 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
401 };
402
403 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
404 if (const auto *Entry =
405 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
406 if (auto KindCost = Entry->Cost[CostKind])
407 return LT.first * *KindCost;
408
409 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
410 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
411 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
412 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
413 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
414 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
415 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
416
417 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
418 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
419 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
420 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
421 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
422 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
423
424 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
425 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
426 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
427 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
428 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
429 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
430
431 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
432 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
433 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
434 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
435 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
436 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
437
438 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
439 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
440 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
441 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
442 };
443
444 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
445 if (const auto *Entry =
446 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
447 if (auto KindCost = Entry->Cost[CostKind])
448 return LT.first * *KindCost;
449
450 static const CostKindTblEntry AVXUniformConstCostTable[] = {
451 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
452 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
453 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
454 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
455 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
456 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
457
458 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
459 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
460 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
461 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
462 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
463 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
464
465 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
466 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
467 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
468 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
469 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
470 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
471
472 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
473 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
474 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
475 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
476 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
477 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
478
479 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
480 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
481 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
482 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
483 };
484
485 // XOP has faster vXi8 shifts.
486 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
487 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
488 if (const auto *Entry =
489 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
490 if (auto KindCost = Entry->Cost[CostKind])
491 return LT.first * *KindCost;
492
493 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
494 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
495 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
496 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
497
498 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
499 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
500 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
501
502 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
503 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
504 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
505
506 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
507 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
508 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
509
510 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
511 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
512 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
513 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
514 };
515
516 // XOP has faster vXi8 shifts.
517 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
518 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
519 if (const auto *Entry =
520 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
521 if (auto KindCost = Entry->Cost[CostKind])
522 return LT.first * *KindCost;
523
524 static const CostKindTblEntry AVX512BWConstCostTable[] = {
525 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
526 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
527 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
528 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
529
530 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
531 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
532 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
533 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
534 };
535
536 if (Op2Info.isConstant() && ST->hasBWI())
537 if (const auto *Entry =
538 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
539 if (auto KindCost = Entry->Cost[CostKind])
540 return LT.first * *KindCost;
541
542 static const CostKindTblEntry AVX512ConstCostTable[] = {
543 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
544 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
545 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
546 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
547
548 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
549 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
550 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
551 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
552
553 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
554 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
555 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
556 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
557 };
558
559 if (Op2Info.isConstant() && ST->hasAVX512())
560 if (const auto *Entry =
561 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
562 if (auto KindCost = Entry->Cost[CostKind])
563 return LT.first * *KindCost;
564
565 static const CostKindTblEntry AVX2ConstCostTable[] = {
566 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
567 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
568 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
569 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
570
571 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
572 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
573 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
574 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
575
576 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
577 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
578 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
579 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
580 };
581
582 if (Op2Info.isConstant() && ST->hasAVX2())
583 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
584 if (auto KindCost = Entry->Cost[CostKind])
585 return LT.first * *KindCost;
586
587 static const CostKindTblEntry AVXConstCostTable[] = {
588 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
589 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
590 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
591 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
592
593 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
594 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
595 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
596 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
597
598 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
599 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
600 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
601 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
602 };
603
604 if (Op2Info.isConstant() && ST->hasAVX())
605 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
606 if (auto KindCost = Entry->Cost[CostKind])
607 return LT.first * *KindCost;
608
609 static const CostKindTblEntry SSE41ConstCostTable[] = {
610 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
611 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
612 };
613
614 if (Op2Info.isConstant() && ST->hasSSE41())
615 if (const auto *Entry =
616 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
617 if (auto KindCost = Entry->Cost[CostKind])
618 return LT.first * *KindCost;
619
620 static const CostKindTblEntry SSE2ConstCostTable[] = {
621 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
622 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
624 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
627 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
628 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
629 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
630
631 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
632 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
633 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
634 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
635 };
636
637 if (Op2Info.isConstant() && ST->hasSSE2())
638 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
639 if (auto KindCost = Entry->Cost[CostKind])
640 return LT.first * *KindCost;
641
642 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
643 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
644 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
645 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
646 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
647 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
648 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
649 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
650 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
651 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
652
653 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
654 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
655 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
656 };
657
658 if (ST->hasBWI() && Op2Info.isUniform())
659 if (const auto *Entry =
660 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
661 if (auto KindCost = Entry->Cost[CostKind])
662 return LT.first * *KindCost;
663
664 static const CostKindTblEntry AVX512UniformCostTable[] = {
665 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
666 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
667 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
668
669 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
670 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
671 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
672
673 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
674 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
675 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
676 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
677 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
678 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
679 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
680 };
681
682 if (ST->hasAVX512() && Op2Info.isUniform())
683 if (const auto *Entry =
684 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
685 if (auto KindCost = Entry->Cost[CostKind])
686 return LT.first * *KindCost;
687
688 static const CostKindTblEntry AVX2UniformCostTable[] = {
689 // Uniform splats are cheaper for the following instructions.
690 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
691 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
692 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
693 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
694 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
696
697 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
698 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
699 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
700 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
701 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
702 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
703
704 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
705 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
706 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
707 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
708 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
709 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
710
711 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
712 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
713 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
714 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
715 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
716 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
717 };
718
719 if (ST->hasAVX2() && Op2Info.isUniform())
720 if (const auto *Entry =
721 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
722 if (auto KindCost = Entry->Cost[CostKind])
723 return LT.first * *KindCost;
724
725 static const CostKindTblEntry AVXUniformCostTable[] = {
726 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
727 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
728 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
729 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
730 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
731 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
732
733 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
734 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
735 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
736 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
737 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
738 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
739
740 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
741 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
742 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
743 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
744 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
745 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
746
747 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
748 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
749 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
750 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
751 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
752 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
753 };
754
755 // XOP has faster vXi8 shifts.
756 if (ST->hasAVX() && Op2Info.isUniform() &&
757 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
758 if (const auto *Entry =
759 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
760 if (auto KindCost = Entry->Cost[CostKind])
761 return LT.first * *KindCost;
762
763 static const CostKindTblEntry SSE2UniformCostTable[] = {
764 // Uniform splats are cheaper for the following instructions.
765 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
766 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
767 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
768
769 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
770 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
771 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
772
773 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
774 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
775 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
776
777 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
778 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
779 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
780 };
781
782 if (ST->hasSSE2() && Op2Info.isUniform() &&
783 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
784 if (const auto *Entry =
785 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
786 if (auto KindCost = Entry->Cost[CostKind])
787 return LT.first * *KindCost;
788
789 static const CostKindTblEntry AVX512DQCostTable[] = {
790 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
791 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
792 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
793 };
794
795 // Look for AVX512DQ lowering tricks for custom cases.
796 if (ST->hasDQI())
797 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
798 if (auto KindCost = Entry->Cost[CostKind])
799 return LT.first * *KindCost;
800
801 static const CostKindTblEntry AVX512BWCostTable[] = {
802 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
803 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
804 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
805 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
806 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
807 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
808 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
809 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
810 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
811
812 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
813 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
814 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
815 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
816 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
817 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
818 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
819 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
820 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
821
822 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
823 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
824
825 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
826 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
827 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
828 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
829
830 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
831 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
832
833 { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
834 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
835
836 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
837 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
838 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
839 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
840 };
841
842 // Look for AVX512BW lowering tricks for custom cases.
843 if (ST->hasBWI())
844 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
845 if (auto KindCost = Entry->Cost[CostKind])
846 return LT.first * *KindCost;
847
848 static const CostKindTblEntry AVX512CostTable[] = {
849 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
850 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
851 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
852
853 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
854 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
855 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
856
857 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
858 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
859 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
860 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
861 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
862 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
863 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
864 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
865 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
866
867 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
868 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
869 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
870 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
871 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
872 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
873 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
874 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
875 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
876
877 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
878 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
879
880 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
881 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
882
883 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
884 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
885 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
886 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
887
888 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
889 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
890 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
891 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
892
893 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
894 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
895 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
896 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
897
898 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
899 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
900 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
901 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
902 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
903
904 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
905
906 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
907 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
908 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
909 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
910 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
911 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
912 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
913 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
914 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
915
916 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
917 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
918 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
919 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
920
921 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
922 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
923 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
924 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
925 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
926 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
927 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
928 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
929 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
930
931 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
932 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
933 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
934 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
935 };
936
937 if (ST->hasAVX512())
938 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
939 if (auto KindCost = Entry->Cost[CostKind])
940 return LT.first * *KindCost;
941
942 static const CostKindTblEntry AVX2ShiftCostTable[] = {
943    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
944    // custom in order to detect the cases where the shift amount is a scalar.
945 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
946 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
947 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
948 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
949 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
950 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
951 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
952 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
953 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
954 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
955 };
956
957 if (ST->hasAVX512()) {
958 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
959 // On AVX512, a packed v32i16 shift left by a constant build_vector
960 // is lowered into a vector multiply (vpmullw).
961 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
962 Op1Info.getNoProps(), Op2Info.getNoProps());
963 }
964
965 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
966 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
967 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
968 Op2Info.isConstant())
969 // On AVX2, a packed v16i16 shift left by a constant build_vector
970 // is lowered into a vector multiply (vpmullw).
971 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
972 Op1Info.getNoProps(), Op2Info.getNoProps());
973
974 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
975 if (auto KindCost = Entry->Cost[CostKind])
976 return LT.first * *KindCost;
977 }
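// e.g. on AVX2 a shift such as shl <16 x i16> %x, <i16 1, i16 2, ...> (any
// constant build_vector amount) is costed via getArithmeticInstrCost for a
// v16i16 multiply, since codegen lowers it to vpmullw by <2, 4, ...>.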
978
979 static const CostKindTblEntry XOPShiftCostTable[] = {
980 // 128bit shifts take 1cy, but right shifts require negation beforehand.
981 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
982 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
983 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
984 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
985 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
986 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
987 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
988 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
989 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
990 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
991 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
992 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
993 // 256bit shifts require splitting if AVX2 didn't catch them above.
994 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
995 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
996 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
997 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
998 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
999 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1000 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1001 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1002 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1003 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1004 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1005 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1006 };
1007
1008 // Look for XOP lowering tricks.
1009 if (ST->hasXOP()) {
1010 // If the right shift is constant then we'll fold the negation so
1011 // it's as cheap as a left shift.
1012 int ShiftISD = ISD;
1013 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1014 ShiftISD = ISD::SHL;
1015 if (const auto *Entry =
1016 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1017 if (auto KindCost = Entry->Cost[CostKind])
1018 return LT.first * *KindCost;
1019 }
1020
1021 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1022 MVT VT = LT.second;
1023    // Vector shift left by a non-uniform constant can be lowered
1024    // into a vector multiply.
1025 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1026 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1027 ISD = ISD::MUL;
1028 }
1029
1030 static const CostKindTblEntry GLMCostTable[] = {
1031 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1032 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1033 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1034 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1035 };
1036
1037 if (ST->useGLMDivSqrtCosts())
1038 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1039 if (auto KindCost = Entry->Cost[CostKind])
1040 return LT.first * *KindCost;
1041
1042 static const CostKindTblEntry SLMCostTable[] = {
1043 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1044 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1045 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1046 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1047 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1048 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1049 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1050 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1051 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1052 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1053 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1054 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1055    // v2i64/v4i64 mul is custom lowered as a series of long
1056    // multiplies(3), shifts(3) and adds(2).
1057    // slm muldq version throughput is 2 and addq throughput 4
1058    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1059    // 2X4 (addq throughput) = 17
1060 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1061    // slm addq/subq throughput is 4
1062 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1063 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1064 };
1065
1066 if (ST->useSLMArithCosts())
1067 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1068 if (auto KindCost = Entry->Cost[CostKind])
1069 return LT.first * *KindCost;
1070
1071 static const CostKindTblEntry AVX2CostTable[] = {
1072 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1073 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1074 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
1075 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1076
1077 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1078 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1079 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1080 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1081
1082 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1083 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1084 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1085 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1086 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1087 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1088
1089 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1090 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1091 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1092 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1093 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1094 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1095 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1096 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1097
1098 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1099 { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
1100 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1101 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1102 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1103 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1104 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1105
1106 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1107
1108 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1109 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1110
1111 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1112 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1113 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1114 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1115 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1116 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1117
1118 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1119 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1120 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1121 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1122 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1123 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1124
1125 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1126 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1127 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1128 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1129 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1130 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1131
1132 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1133 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1134 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1135 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1136 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1137 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1138 };
1139
1140 // Look for AVX2 lowering tricks for custom cases.
1141 if (ST->hasAVX2())
1142 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1143 if (auto KindCost = Entry->Cost[CostKind])
1144 return LT.first * *KindCost;
1145
1146 static const CostKindTblEntry AVX1CostTable[] = {
1147 // We don't have to scalarize unsupported ops. We can issue two half-sized
1148 // operations and we only need to extract the upper YMM half.
1149 // Two ops + 1 extract + 1 insert = 4.
1150 { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
1151 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1152 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1153 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1154 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1155
1156 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1157 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1158 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1159 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1160
1161 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1162 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1163 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1164 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1165
1166 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1167 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1168 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1169 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1170
1171 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1172 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1173 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1174 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1175 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1176 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1177 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1178 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1179 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1180 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1181
1182 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1183 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1184 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1185 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1186 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1187 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1188 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1189 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1190
1191 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1192 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1193 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1194 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1195 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1196 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1197 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1198 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1199
1200 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1201 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1202 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1203 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1204 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1205 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1206 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1207 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1208
1209 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1210 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1211
1212 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1213 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1214 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1215 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1216 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1217 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1218
1219 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1220 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1221 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1222 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1223 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1224 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1225
1226 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1227 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1228 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1229 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1230 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1231 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1232
1233 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1234 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1235 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1236 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1237 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1238 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1239 };
1240
1241 if (ST->hasAVX())
1242 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1243 if (auto KindCost = Entry->Cost[CostKind])
1244 return LT.first * *KindCost;
1245
1246 static const CostKindTblEntry SSE42CostTable[] = {
1247 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1248 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1249 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1250 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1251
1252 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1253 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1254 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1255 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1256
1257 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1258 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1259 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1260 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1261
1262 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1263 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1264 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1265 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1266
1267 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1268 };
1269
1270 if (ST->hasSSE42())
1271 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1272 if (auto KindCost = Entry->Cost[CostKind])
1273 return LT.first * *KindCost;
1274
1275 static const CostKindTblEntry SSE41CostTable[] = {
1276 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1277 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1278 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1279
1280 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1281 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1282 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1283 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1284
1285 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1286 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1287 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1288 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1289
1290 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1291 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1292 };
1293
1294 if (ST->hasSSE41())
1295 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1296 if (auto KindCost = Entry->Cost[CostKind])
1297 return LT.first * *KindCost;
1298
1299 static const CostKindTblEntry SSE2CostTable[] = {
1300 // We don't correctly identify costs of casts because they are marked as
1301 // custom.
1302 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1303 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1304 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1305 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1306
1307 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1308 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1309 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1310 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1311
1312 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1313 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1314 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1315 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1316
1317 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1318 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1319 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1320 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1321
1322 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1323 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1324 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1325 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1326
1327 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1328 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1329 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1330 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1331
1332 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1333 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1334
1335 { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1336 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1337 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1338 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1339
1340 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1341
1342 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1343 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1344 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1345 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1346
1347 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1348 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1349 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1350 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1351
1352 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1353 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1354 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1355
1356 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1357 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1358 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1359
1360 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1361 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362 };
1363
1364 if (ST->hasSSE2())
1365 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1366 if (auto KindCost = Entry->Cost[CostKind])
1367 return LT.first * *KindCost;
1368
1369 static const CostKindTblEntry SSE1CostTable[] = {
1370 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1371 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1372
1373 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1374 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1375
1376 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1377 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1378
1379 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1380 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1381
1382 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1383 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1384 };
1385
1386 if (ST->hasSSE1())
1387 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1388 if (auto KindCost = Entry->Cost[CostKind])
1389 return LT.first * *KindCost;
1390
1391 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1392 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1393 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1394 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1395 };
1396
1397 if (ST->is64Bit())
1398 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1399 if (auto KindCost = Entry->Cost[CostKind])
1400 return LT.first * *KindCost;
1401
1402 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1403 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1404 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1405 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1406
1407 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1408 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1409 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1410
1411 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1412 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1413 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1414
1415 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1417 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1418 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1419 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1420 };
1421
1422 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1423 if (auto KindCost = Entry->Cost[CostKind])
1424 return LT.first * *KindCost;
1425
1426 // It is not a good idea to vectorize division. We have to scalarize it and
1427 // in the process we will often end up having to spill regular
1428 // registers. The overhead of division is going to dominate most kernels
1429 // anyway, so try hard to prevent vectorization of division - it is
1430 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1431 // to hide "20 cycles" for each lane.
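// For example, an illustrative <8 x i32> sdiv on a 128-bit target legalizes
// to two v4i32 pieces (LT.first = 2, 4 lanes per legal vector), so the
// formula below returns 20 * 2 * 4 * ScalarCost = 160x the scalar division
// cost, which strongly discourages vectorizing the division.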
1432 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1433 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1434 ISD == ISD::UREM)) {
1435 InstructionCost ScalarCost =
1436 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1437 Op1Info.getNoProps(), Op2Info.getNoProps());
1438 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1439 }
1440
1441 // Handle some basic single instruction code size cases.
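// Each of these opcodes lowers to a single instruction per legalized vector,
// so the code-size cost is just the legalization split count.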
1442 if (CostKind == TTI::TCK_CodeSize) {
1443 switch (ISD) {
1444 case ISD::FADD:
1445 case ISD::FSUB:
1446 case ISD::FMUL:
1447 case ISD::FDIV:
1448 case ISD::FNEG:
1449 case ISD::AND:
1450 case ISD::OR:
1451 case ISD::XOR:
1452 return LT.first;
1453 break;
1454 }
1455 }
1456
1457 // Fallback to the default implementation.
1458 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1459 Args, CxtI);
1460}
1461
1462InstructionCost
1463X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1464 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1465 TTI::TargetCostKind CostKind) const {
1466 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1467 return TTI::TCC_Basic;
1468 return InstructionCost::getInvalid();
1469}
1470
1471InstructionCost X86TTIImpl::getShuffleCost(
1472 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1473 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1474 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1475 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1476 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1477 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1478
1479 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1480
1481 // Recognize a basic concat_vector shuffle.
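// E.g. an identity two-source mask of width 2*N over <N x T> inputs is just
// concat(V1, V2), so it is costed as inserting one <N x T> source into the
// upper half of a double-width vector rather than as a generic two-source
// permute.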
1482 if (Kind == TTI::SK_PermuteTwoSrc &&
1483 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1484 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1485 return getShuffleCost(TTI::SK_InsertSubvector,
1486 VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1487 CostKind, Mask.size() / 2, BaseTp);
1488
1489 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1490 if (Kind == TTI::SK_Transpose)
1491 Kind = TTI::SK_PermuteTwoSrc;
1492
1493 // For Broadcasts we are splatting the first element from the first input
1494 // register, so we only need to reference that input; all the output
1495 // registers are the same.
1496 if (Kind == TTI::SK_Broadcast)
1497 LT.first = 1;
1498
1499 // Treat <X x bfloat> shuffles as <X x half>.
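// bf16 and f16 vectors have the same size and lane layout, and shuffles only
// move bits, so the half-precision table entries below apply unchanged.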
1500 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1501 LT.second = LT.second.changeVectorElementType(MVT::f16);
1502
1503 // Subvector extractions are free if they start at the beginning of a
1504 // vector and cheap if the subvectors are aligned.
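// E.g. extracting the low <4 x float> half of a legal <8 x float> (Index 0)
// is free, while extracting the high half (Index 4) stays subvector-aligned
// and costs SubLT.first; only misaligned extracts fall through to the generic
// shuffle costing below.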
1505 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1506 int NumElts = LT.second.getVectorNumElements();
1507 if ((Index % NumElts) == 0)
1508 return 0;
1509 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1510 if (SubLT.second.isVector()) {
1511 int NumSubElts = SubLT.second.getVectorNumElements();
1512 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1513 return SubLT.first;
1514 // Handle some cases for widening legalization. For now we only handle
1515 // cases where the original subvector was naturally aligned and evenly
1516 // fit in its legalized subvector type.
1517 // FIXME: Remove some of the alignment restrictions.
1518 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1519 // vectors.
1520 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1521 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1522 (NumSubElts % OrigSubElts) == 0 &&
1523 LT.second.getVectorElementType() ==
1524 SubLT.second.getVectorElementType() &&
1525 LT.second.getVectorElementType().getSizeInBits() ==
1526 SubLT.second.getVectorElementType().getSizeInBits()) {
1527 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1528 "Unexpected number of elements!");
1529 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1530 LT.second.getVectorNumElements());
1531 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1532 SubLT.second.getVectorNumElements());
1533 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1534 InstructionCost ExtractCost =
1535 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1536 CostKind, ExtractIndex, SubTy);
1537
1538 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1539 // if we have SSSE3 we can use pshufb.
1540 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1541 return ExtractCost + 1; // pshufd or pshufb
1542
1543 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1544 "Unexpected vector size");
1545
1546 return ExtractCost + 2; // worst case pshufhw + pshufd
1547 }
1548 }
1549 // If the extract subvector is not optimal, treat it as single op shuffle.
1550 Kind = TTI::SK_PermuteSingleSrc;
1551 }
1552
1553 // Subvector insertions are cheap if the subvectors are aligned.
1554 // Note that in general, the insertion starting at the beginning of a vector
1555 // isn't free, because we need to preserve the rest of the wide vector.
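// E.g. inserting a <4 x i32> subvector into a legal <8 x i32> at Index 0 or 4
// costs SubLT.first, while an insertion at an unaligned index such as 2 is
// modelled as a two-source permute below.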
1556 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1557 int NumElts = LT.second.getVectorNumElements();
1558 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1559 if (SubLT.second.isVector()) {
1560 int NumSubElts = SubLT.second.getVectorNumElements();
1561 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1562 return SubLT.first;
1563 }
1564
1565 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1566 Kind = TTI::SK_PermuteTwoSrc;
1567 }
1568
1569 // Handle some common (illegal) sub-vector types as they are often very cheap
1570 // to shuffle even on targets without PSHUFB.
1571 EVT VT = TLI->getValueType(DL, BaseTp);
1572 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1573 !ST->hasSSSE3()) {
1574 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1575 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1576 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1577 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1578 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1579 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1580
1581 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1582 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1583 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1584 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1585
1586 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1587 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1588 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1589 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1590
1591 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1592 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1593 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1594 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1595 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1596
1597 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1598 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1599 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1600 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1601 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1602 };
1603
1604 if (ST->hasSSE2())
1605 if (const auto *Entry =
1606 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1607 return Entry->Cost;
1608 }
1609
1610 // We are going to permute multiple sources and the result will be in multiple
1611 // destinations. Provide an accurate cost only for splits where the element
1612 // type remains the same.
1613 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1614 MVT LegalVT = LT.second;
1615 if (LegalVT.isVector() &&
1616 LegalVT.getVectorElementType().getSizeInBits() ==
1617 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1618 LegalVT.getVectorNumElements() <
1619 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1620 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1621 unsigned LegalVTSize = LegalVT.getStoreSize();
1622 // Number of source vectors after legalization:
1623 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1624 // Number of destination vectors after legalization:
1625 InstructionCost NumOfDests = LT.first;
1626
1627 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1628 LegalVT.getVectorNumElements());
1629
1630 if (!Mask.empty() && NumOfDests.isValid()) {
1631 // Try to perform a better estimation of the permutation.
1632 // 1. Split the source/destination vectors into real registers.
1633 // 2. Do the mask analysis to identify which real registers are
1634 // permuted. If more than one source register is used to build a
1635 // destination register, the cost for this destination register
1636 // is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1637 // source register is used, build the mask and calculate the cost as the
1638 // cost of PermuteSingleSrc.
1639 // Also, for the single register permute we try to identify if the
1640 // destination register is just a copy of the source register or a
1641 // copy of the previous destination register (the cost is
1642 // TTI::TCC_Basic). If the source register is just reused, the cost for
1643 // this operation is 0.
1644 NumOfDests =
1645 getTypeLegalizationCost(
1646 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1647 .first;
1648 unsigned E = *NumOfDests.getValue();
1649 unsigned NormalizedVF =
1650 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1651 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1652 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1653 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1654 copy(Mask, NormalizedMask.begin());
1655 unsigned PrevSrcReg = 0;
1656 ArrayRef<int> PrevRegMask;
1657 InstructionCost Cost = 0;
1658 processShuffleMasks(
1659 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1660 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1661 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1662 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1663 // Check if the previous register can be just copied to the next
1664 // one.
1665 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1666 PrevRegMask != RegMask)
1667 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1668 RegMask, CostKind, 0, nullptr);
1669 else
1670 // Just a copy of previous destination register.
1671 Cost += TTI::TCC_Basic;
1672 return;
1673 }
1674 if (SrcReg != DestReg &&
1675 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1676 // Just a copy of the source register.
1677 Cost += TTI::TCC_Basic;
1678 }
1679 PrevSrcReg = SrcReg;
1680 PrevRegMask = RegMask;
1681 },
1682 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1683 unsigned /*Unused*/,
1684 unsigned /*Unused*/) {
1685 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1686 CostKind, 0, nullptr);
1687 });
1688 return Cost;
1689 }
1690
1691 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1692 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1693 std::nullopt, CostKind, 0, nullptr);
1694 }
1695
1696 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1697 }
1698
1699 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
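// E.g. if the type splits into LT.first = 2 legal registers, each of the 2
// destination registers may need data from both halves of both sources, so
// this charges 2 * (2 * 2 - 1) = 6 legal-width shuffles.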
1700 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1701 // We assume that source and destination have the same vector type.
1702 InstructionCost NumOfDests = LT.first;
1703 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1704 LT.first = NumOfDests * NumOfShufflesPerDest;
1705 }
1706
1707 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1708 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1709 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1710
1711 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1712 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1713
1714 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1715 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1716 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1717 };
1718
1719 if (ST->hasVBMI())
1720 if (const auto *Entry =
1721 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1722 return LT.first * Entry->Cost;
1723
1724 static const CostTblEntry AVX512BWShuffleTbl[] = {
1725 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1726 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1727 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1728
1729 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1730 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1731 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1732 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1733
1734 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1735 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1736 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1737 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1738 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1739
1740 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1741 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1742 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1743 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1744 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1745
1746 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1747 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1748
1749 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1750 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1751 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1752 };
1753
1754 if (ST->hasBWI())
1755 if (const auto *Entry =
1756 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1757 return LT.first * Entry->Cost;
1758
1759 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1760 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1761 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1762 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1763 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1764 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1765 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1766 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1767
1768 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1769 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1770 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1771 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1772 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1773 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1774 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1775
1776 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1777 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1778 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1779 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1780 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1781 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1782 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1783 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1784 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1785 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1786 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1787
1788 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1789 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1790 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1791 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1792 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1793 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1794 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1795 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1796 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1797 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1798 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1799 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1800 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1801
1802 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1803 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1804 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1805 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1806 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1807 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1808 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1809 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1810 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1811 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1812 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1813 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1814
1815 // FIXME: This just applies the type legalization cost rules above
1816 // assuming these completely split.
1817 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1818 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1819 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1820 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1821 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1822 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1823
1824 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1825 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1826 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1827 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1828 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1829 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1830 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1831 };
1832
1833 if (ST->hasAVX512())
1834 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1835 if (auto KindCost = Entry->Cost[CostKind])
1836 return LT.first * *KindCost;
1837
1838 static const CostTblEntry AVX2ShuffleTbl[] = {
1839 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1840 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1841 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1842 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1843 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1844 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1845 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1846
1847 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1848 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1849 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1850 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1851 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1852 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1853 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1854
1855 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1856 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1857 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1858
1859 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1860 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1861 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1862 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1863 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1864
1865 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1866 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1867 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1868 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1869 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1870 // + vpblendvb
1871 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1872 // + vpblendvb
1873 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1874 // + vpblendvb
1875
1876 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1877 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1878 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1879 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1880 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1881 // + vpblendvb
1882 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1883 // + vpblendvb
1884 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1885 // + vpblendvb
1886 };
1887
1888 if (ST->hasAVX2())
1889 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1890 return LT.first * Entry->Cost;
1891
1892 static const CostTblEntry XOPShuffleTbl[] = {
1893 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1894 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1895 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1896 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1897 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1898 // + vinsertf128
1899 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1900 // + vinsertf128
1901
1902 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1903 // + vinsertf128
1904 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1905 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1906 // + vinsertf128
1907 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1908 };
1909
1910 if (ST->hasXOP())
1911 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1912 return LT.first * Entry->Cost;
1913
1914 static const CostTblEntry AVX1ShuffleTbl[] = {
1915 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1916 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1917 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1918 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1919 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1920 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1921 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1922
1923 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1924 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1925 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1926 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1927 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1928 // + vinsertf128
1929 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1930 // + vinsertf128
1931 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1932 // + vinsertf128
1933
1934 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1935 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1936 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1937 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1938 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1939 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1940 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1941
1942 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1943 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1944 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1945 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1946 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1947 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1948 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1949
1950 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1951 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1952 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1953 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1954 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1955 // + 2*por + vinsertf128
1956 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1957 // + 2*por + vinsertf128
1958 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1959 // + 2*por + vinsertf128
1960
1961 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1962 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1963 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1964 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1965 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1966 // + 4*por + vinsertf128
1967 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1968 // + 4*por + vinsertf128
1969 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1970 // + 4*por + vinsertf128
1971 };
1972
1973 if (ST->hasAVX())
1974 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1975 return LT.first * Entry->Cost;
1976
1977 static const CostTblEntry SSE41ShuffleTbl[] = {
1978 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1979 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1980 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1981 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1982 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1983 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1984 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1985 };
1986
1987 if (ST->hasSSE41())
1988 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1989 return LT.first * Entry->Cost;
1990
1991 static const CostTblEntry SSSE3ShuffleTbl[] = {
1992 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1993 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1994 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1995
1996 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1997 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
1998 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1999
2000 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2001 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2002 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2003
2004 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2005 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2006 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2007 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2008 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2009
2010 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2011 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2012 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2013
2014 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2015 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2016 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2017 };
2018
2019 if (ST->hasSSSE3())
2020 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2021 return LT.first * Entry->Cost;
2022
2023 static const CostTblEntry SSE2ShuffleTbl[] = {
2024 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2025 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2026 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2027 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2028 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2029 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2030
2031 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2032 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2033 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2034 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2035 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2036 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2037 // + 2*pshufd + 2*unpck + packus
2038
2039 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2040 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2041 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2042 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2043 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2044 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2045
2046 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2047 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2048 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2049 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por
2050 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por
2051 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por
2052
2053 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2054 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2055 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2056 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2057 // + pshufd/unpck
2058 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2059 // + pshufd/unpck
2060 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2061 // + 2*pshufd + 2*unpck + 2*packus
2062
2063 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2064 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2065 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2066 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2067 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2068 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2069 };
2070
2071 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2072 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2073 };
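// A broadcast whose operand is a load is free here because SSE3's movddup can
// load and duplicate the f64 element in one instruction; the load itself is
// costed separately.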
2074
2075 if (ST->hasSSE2()) {
2076 bool IsLoad =
2077 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2078 if (ST->hasSSE3() && IsLoad)
2079 if (const auto *Entry =
2080 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2081 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2082 LT.second.getVectorElementCount()) &&
2083 "Table entry missing from isLegalBroadcastLoad()");
2084 return LT.first * Entry->Cost;
2085 }
2086
2087 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2088 return LT.first * Entry->Cost;
2089 }
2090
2091 static const CostTblEntry SSE1ShuffleTbl[] = {
2092 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2093 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2094 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2095 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2096 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2097 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2098 };
2099
2100 if (ST->hasSSE1())
2101 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2102 return LT.first * Entry->Cost;
2103
2104 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2105}
2106
2107InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2108 Type *Src,
2109 TTI::CastContextHint CCH,
2110 TTI::TargetCostKind CostKind,
2111 const Instruction *I) {
2112 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2113 assert(ISD && "Invalid opcode");
2114
2115 // TODO: Allow non-throughput costs that aren't binary.
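// With the current AdjustCost lambda, any cost kind other than reciprocal
// throughput is clamped to 0 or 1, i.e. the conversion tables below are only
// really calibrated for throughput.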
2116 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2117 if (CostKind != TTI::TCK_RecipThroughput)
2118 return Cost == 0 ? 0 : 1;
2119 return Cost;
2120 };
2121
2122 // The cost tables include both specific, custom (non-legal) src/dst type
2123 // conversions and generic, legalized types. We test for the custom cases
2124 // first, before falling back to legalization.
2125 // FIXME: Need a better design of the cost table to handle non-simple types and
2126 // the potentially massive number of combinations (elem_num x src_type x dst_type).
2127 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2128 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2129 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2130
2131 // Mask sign extend has an instruction.
2132 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2133 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2134 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2135 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2136 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2137 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2138 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2139 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2140 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2141 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2142 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2143 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2144 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2145 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2146 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
2147 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
2148 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
2149
2150 // Mask zero extend is a sext + shift.
2151 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2152 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2153 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2154 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2155 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2156 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2157 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2158 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2159 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2160 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2161 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2162 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2163 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2164 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2165 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
2166 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
2167 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
2168
2169 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2170 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2171 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2172 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2173 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2174 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2175 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2176 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2177 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2178 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2179 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2180 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2181 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2182 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2183 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
2184 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
2185 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
2186
2187 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
2188 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
2189 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
2190 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
2191 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
2192 };
2193
2194 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2195 // Mask sign extend has an instruction.
2196 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2197 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2198 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2199 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2200 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2201 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
2202 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
2203 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
2204
2205 // Mask zero extend is a sext + shift.
2206 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2207 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2208 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2209 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2210 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2211 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
2212 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
2213 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
2214
2215 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2216 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2217 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2218 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2219 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2220 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
2221 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
2222 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
2223
2224 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2225 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2226
2227 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2228 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2229
2230 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
2231 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
2232
2233 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
2234 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
2235 };
2236
2237 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2238 // 256-bit wide vectors.
2239
2240 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2241 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
2242 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
2243 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4
2244 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
2245
2246 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2247 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2248 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2249 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
2250 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2251 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2252 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2253 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2254 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
2255 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
2256 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
2257 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
2258 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
2259 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
2260 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
2261 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
2262 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
2263 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
2264 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
2265 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
2266 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
2267 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
2268 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
2269 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
2270 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
2271 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
2272 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
2273 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
2274 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
2275 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
2276 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
2277 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
2278 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
2279 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2280
2281 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
2282 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
2283 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
2284
2285 // Sign extend is zmm vpternlogd+vptruncdb.
2286 // Zero extend is zmm broadcast load+vptruncdw.
2287 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
2288 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
2289 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
2290 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
2291 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
2292 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
2293 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
2294 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
2295
2296 // Sign extend is zmm vpternlogd+vptruncdw.
2297 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2298 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
2299 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2300 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
2301 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2302 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
2303 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2304 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
2305 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2306
2307 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
2308 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
2309 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
2310 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
2311 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
2312 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
2313 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
2314 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
2315 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
2316 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
2317
2318 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
2319 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2320 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
2321 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
2322
2323 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2324 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2325 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2326 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2327 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2328 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2329 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2330 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2331 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2332 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2333
2334 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2335 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2336
2337 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2338 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2339 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2340 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2341 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2342 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2343 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2344 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2345
2346 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2347 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2348 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2349 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2350 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2351 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2352 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2353 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2354 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
2355 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
2356
2357 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2358 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
2359 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
2360 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
2361 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
2362 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
2363 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
2364 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
2365 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
2366 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
2367 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
2368
2369 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2370 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
2371 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
2372 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
2373 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
2374 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
2375 };
2376
2377 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2378 // Mask sign extend has an instruction.
2379 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2380 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2381 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2382 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2383 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2384 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2385 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2386 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2387 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2388 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2389 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2390 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2391 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2392 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2393 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
2394 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
2395 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
2396
2397 // Mask zero extend is a sext + shift.
2398 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2399 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2400 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2401 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2402 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2403 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2404 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2405 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2406 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2407 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2408 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2409 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2410 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2411 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2412 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
2413 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
2414 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
2415
2416 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2417 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2418 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2419 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2420 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2421 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2422 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2423 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2424 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2425 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2426 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2427 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2428 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2429 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2430 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
2431 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
2432 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
2433
2434 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
2435 };
2436
2437 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
2438 // Mask sign extend has an instruction.
2439 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2440 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2441 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2442 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
2443 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2444 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
2445 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
2446 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2447
2448 // Mask zero extend is a sext + shift.
2449 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2450 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2451 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2452 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
2453 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2454 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
2455 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
2456 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2457
2458 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
2459 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
2460 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2461 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2462 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2463 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2464 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
2465 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2466
2467 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2468 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2469 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2470 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2471
2472 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2473 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2474 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2475 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2476
2477 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
2478 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
2479 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2480 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
2481
2482 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
2483 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
2484 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2485 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
2486 };
2487
2488 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2489 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2490 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2491 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2492 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2493 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2494 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2495 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2496 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2497 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2498 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2499 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2500 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2501 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2502 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2503 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2504 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2505 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2506 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2507
2508 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2509 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2510 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
2511 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
2512 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
2513 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
2514 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
2515 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
2516 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
2517 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
2518
2519 // sign extend is vpcmpeq+maskedmove+vpmovdw
2520 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2521 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2522 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
2523 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2524 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
2525 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2526 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
2527 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2528 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2529
2530 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2531 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2532 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2533 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2534 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2535 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2536 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
2537 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2538
2539 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2540 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2541 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2542 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2543
2544 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2545 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2546 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2547 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2548 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2549 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2550 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2551 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2552 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2553 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2554 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2555 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2556
2557 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2558 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2559 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2560 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2561
2562 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
2563 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
2564 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2565 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2566 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2567 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2568 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2569 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2570 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2571 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2572 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
2573 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2574 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
2575
2576 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2577 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2578 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
2579
2580 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
2581 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
2582 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2583 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
2584 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
2585 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
2586 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2587 };
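// [Editorial note - not part of the upstream file] The *VL tables above hold
// the 128/256-bit (AVX512VL) variants: in the lookup code further down they
// are consulted whenever the matching feature bit (BWI/DQI/AVX512) is set,
// without requiring useAVX512Regs(), whereas the 512-bit AVX512BW/DQ/F tables
// are only reachable inside the useAVX512Regs() block.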
2588
2589 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2590 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2591 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2592 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2593 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2594 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2595 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2596
2597 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2598 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2599 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2600 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2601 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2602 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2603 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2604 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2605 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2606 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2607 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2608 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2609 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2610 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2611
2612 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2613
2614 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
2615 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
2616 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
2617 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
2618 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
2619 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
2620 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
2621 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
2622 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
2623 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
2624 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
2625 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
2626
2627 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
2628 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
2629
2630 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
2631 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
2632 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
2633 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
2634
2635 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
2636 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
2637 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
2638 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2639 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2640 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
2641 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
2642 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
2643
2644 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2645 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2646 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2647 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2648 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2649 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2650 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
2651
2652 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2653 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2654 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2655 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2656 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
2657 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
2658 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
2659 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2660 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2661 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2662 };
2663
2664 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2665 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2666 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2667 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2668 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2669 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2670 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2671
2672 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2673 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2674 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2675 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2676 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2677 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2678 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2679 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2680 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2681 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2682 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2683 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2684
2685 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
2686 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
2687 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
2688 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
2689 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
2690
2691 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2692 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
2693 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2694 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
2695 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2696 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
2697 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2698 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
2699
2700 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
2701 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
2702 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
2703 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2704 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2705 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2706 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2707 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2708 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2709 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2710 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
2711 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
2712
2713 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
2714 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
2715 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
2716 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2717 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2718 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2719 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2720 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
2721 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
2722 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2723 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
2724 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
2725 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
2726 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
2727 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
2728 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2729 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
2730
2731 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2732 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
2733 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
2734 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
2735 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
2736 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
2737 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
2738 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
2739 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
2740 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
2741 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
2742
2743 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
2744 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
2745 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
2746 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
2747 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
2748 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
2749 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
2750 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
2751 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2752 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2753 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
2754 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
2755 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
2756
2757 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2758 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2759 };
2760
2761 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2762 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2763 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2764 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2765 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2766 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2767 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2768 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2769 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2770 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2771 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2772 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2773 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2774
2775 // These truncates end up widening elements.
2776 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2777 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2778 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2779
2780 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2781 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2782 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2783
2784 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2785 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2786 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2787 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2788 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2789 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2790 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2791 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2792 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2793 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2794 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2795
2796 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2797 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2798 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2799 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2800 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2801 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2802 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2803 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2804 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2805 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2806 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2807 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2808 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2809 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2810
2811 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2812 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2813 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2814 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2815 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2816 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2817 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2818 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2819 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2820 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2821
2822 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2823 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2824 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2825 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2826 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2827 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2828 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2829 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2830 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2831 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2832 };
2833
2834 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2835 // These are somewhat magic numbers justified by comparing the
2836 // output of llvm-mca for our various supported scheduler models
2837 // and basing it off the worst case scenario.
2838 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
2839 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
2840 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
2841 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
2842 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2843 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2844 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2845 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2846 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2847 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2848 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2849 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2850
2851 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
2852 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
2853 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2854 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2855 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2856 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2857 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2858 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2859 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
2860 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2861 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2862 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2863 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2864
2865 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
2866 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
2867 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
2868 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
2869 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
2870 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
2871 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
2872 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
2873 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
2874 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
2875
2876 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
2877 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2878 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
2879 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2880 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
2881 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
2882 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
2883 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
2884 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2885 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
2886
2887 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2888 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2889 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
2890 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
2891 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2892 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
2893 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
2894 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
2895 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2896 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
2897 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2898 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
2899
2900 // These truncates are really widening elements.
2901 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2902 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2903 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2904 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2905 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2906 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2907
2908 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2909 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2910 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2911 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2912 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2913 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
2914 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2915 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 },
2916 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2917 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2918 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2919 };
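// [Editorial note - not part of the upstream file] A minimal sketch of how the
// TypeConversionCostTblEntry tables above are consumed, assuming the
// ConvertCostTableLookup helper used below: entries are keyed on
// (ISD opcode, destination MVT, source MVT), so a query such as
//   ConvertCostTableLookup(SSE2ConversionTbl, ISD::TRUNCATE,
//                          MVT::v16i8, MVT::v8i16)
// returns the { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 } entry, i.e. a cost
// of 2 (PAND+PACKUSWB) on a bare SSE2 target.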
2920
2921 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2922 EVT SrcTy = TLI->getValueType(DL, Src);
2923 EVT DstTy = TLI->getValueType(DL, Dst);
2924
2925 // The function getSimpleVT only handles simple value types.
2926 if (SrcTy.isSimple() && DstTy.isSimple()) {
2927 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2928 MVT SimpleDstTy = DstTy.getSimpleVT();
2929
2930 if (ST->useAVX512Regs()) {
2931 if (ST->hasBWI())
2932 if (const auto *Entry = ConvertCostTableLookup(
2933 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2934 return AdjustCost(Entry->Cost);
2935
2936 if (ST->hasDQI())
2937 if (const auto *Entry = ConvertCostTableLookup(
2938 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2939 return AdjustCost(Entry->Cost);
2940
2941 if (ST->hasAVX512())
2942 if (const auto *Entry = ConvertCostTableLookup(
2943 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2944 return AdjustCost(Entry->Cost);
2945 }
2946
2947 if (ST->hasBWI())
2948 if (const auto *Entry = ConvertCostTableLookup(
2949 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2950 return AdjustCost(Entry->Cost);
2951
2952 if (ST->hasDQI())
2953 if (const auto *Entry = ConvertCostTableLookup(
2954 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2955 return AdjustCost(Entry->Cost);
2956
2957 if (ST->hasAVX512())
2958 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2959 SimpleDstTy, SimpleSrcTy))
2960 return AdjustCost(Entry->Cost);
2961
2962 if (ST->hasAVX2()) {
2963 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2964 SimpleDstTy, SimpleSrcTy))
2965 return AdjustCost(Entry->Cost);
2966 }
2967
2968 if (ST->hasAVX()) {
2969 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2970 SimpleDstTy, SimpleSrcTy))
2971 return AdjustCost(Entry->Cost);
2972 }
2973
2974 if (ST->hasSSE41()) {
2975 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2976 SimpleDstTy, SimpleSrcTy))
2977 return AdjustCost(Entry->Cost);
2978 }
2979
2980 if (ST->hasSSE2()) {
2981 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2982 SimpleDstTy, SimpleSrcTy))
2983 return AdjustCost(Entry->Cost);
2984 }
2985 }
2986
2987 // Fall back to legalized types.
2988 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2989 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2990
2991 // If we're truncating to the same legalized type - just assume it's free.
2992 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2993 return TTI::TCC_Free;
2994
2995 if (ST->useAVX512Regs()) {
2996 if (ST->hasBWI())
2997 if (const auto *Entry = ConvertCostTableLookup(
2998 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2999 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3000
3001 if (ST->hasDQI())
3002 if (const auto *Entry = ConvertCostTableLookup(
3003 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3004 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3005
3006 if (ST->hasAVX512())
3007 if (const auto *Entry = ConvertCostTableLookup(
3008 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3009 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3010 }
3011
3012 if (ST->hasBWI())
3013 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3014 LTDest.second, LTSrc.second))
3015 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3016
3017 if (ST->hasDQI())
3018 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3019 LTDest.second, LTSrc.second))
3020 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3021
3022 if (ST->hasAVX512())
3023 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3024 LTDest.second, LTSrc.second))
3025 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3026
3027 if (ST->hasAVX2())
3028 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3029 LTDest.second, LTSrc.second))
3030 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3031
3032 if (ST->hasAVX())
3033 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3034 LTDest.second, LTSrc.second))
3035 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3036
3037 if (ST->hasSSE41())
3038 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3039 LTDest.second, LTSrc.second))
3040 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3041
3042 if (ST->hasSSE2())
3043 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3044 LTDest.second, LTSrc.second))
3045 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3046
3047 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3048 // sitofp.
3049 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3050 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3051 Type *ExtSrc = Src->getWithNewBitWidth(32);
3052 unsigned ExtOpc =
3053 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3054
3055 // For scalar loads the extend would be free.
3056 InstructionCost ExtCost = 0;
3057 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3058 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3059
3060 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3061 TTI::CastContextHint::None, CostKind);
3062 }
3063
3064 // Fallback for fptosi/fptoui i8/i16 cases: perform the fptosi to i32 and
3065 // then truncate the result.
3066 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3067 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3068 Type *TruncDst = Dst->getWithNewBitWidth(32);
3069 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3070 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3071 TTI::CastContextHint::None, CostKind);
3072 }
3073
3074 return AdjustCost(
3075 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3076}
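// [Editorial note - not part of the upstream file] Worked example of the
// small-integer fallback above (a sketch; the exact numbers depend on which
// subtarget table matches): when no table entry covers a sitofp/uitofp from
// i8/i16 elements, the cast is priced as an extend to i32 plus a sitofp from
// i32, e.g. for <4 x i16> -> <4 x float> roughly
//   getCastInstrCost(SExt,   <4 x i32>,   <4 x i16>, ...) +
//   getCastInstrCost(SIToFP, <4 x float>, <4 x i32>, ...)
// and, symmetrically, fptosi/fptoui to i8/i16 is priced as fptosi-to-i32 plus
// a truncate.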
3077
3078 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3079 Type *CondTy,
3080 CmpInst::Predicate VecPred,
3081 TTI::TargetCostKind CostKind,
3082 const Instruction *I) {
3083 // Early out if this type isn't scalar/vector integer/float.
3084 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3085 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3086 I);
3087
3088 // Legalize the type.
3089 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3090
3091 MVT MTy = LT.second;
3092
3093 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3094 assert(ISD && "Invalid opcode");
3095
3096 InstructionCost ExtraCost = 0;
3097 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3098 // Some vector comparison predicates cost extra instructions.
3099 // TODO: Adjust ExtraCost based on CostKind?
3100 // TODO: Should we invert this and assume worst case cmp costs
3101 // and reduce for particular predicates?
3102 if (MTy.isVector() &&
3103 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3104 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3105 ST->hasBWI())) {
3106 // Fallback to I if a specific predicate wasn't specified.
3107 CmpInst::Predicate Pred = VecPred;
3108 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3109 Pred == CmpInst::BAD_FCMP_PREDICATE))
3110 Pred = cast<CmpInst>(I)->getPredicate();
3111
3112 bool CmpWithConstant = false;
3113 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3114 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3115
3116 switch (Pred) {
3117 case CmpInst::Predicate::ICMP_NE:
3118 // xor(cmpeq(x,y),-1)
3119 ExtraCost = CmpWithConstant ? 0 : 1;
3120 break;
3121 case CmpInst::Predicate::ICMP_SGE:
3122 case CmpInst::Predicate::ICMP_SLE:
3123 // xor(cmpgt(x,y),-1)
3124 ExtraCost = CmpWithConstant ? 0 : 1;
3125 break;
3126 case CmpInst::Predicate::ICMP_ULT:
3127 case CmpInst::Predicate::ICMP_UGT:
3128 // cmpgt(xor(x,signbit),xor(y,signbit))
3129 // xor(cmpeq(pmaxu(x,y),x),-1)
3130 ExtraCost = CmpWithConstant ? 1 : 2;
3131 break;
3132 case CmpInst::Predicate::ICMP_ULE:
3133 case CmpInst::Predicate::ICMP_UGE:
3134 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3135 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3136 // cmpeq(psubus(x,y),0)
3137 // cmpeq(pminu(x,y),x)
3138 ExtraCost = 1;
3139 } else {
3140 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3141 ExtraCost = CmpWithConstant ? 2 : 3;
3142 }
3143 break;
3144 case CmpInst::Predicate::FCMP_ONE:
3145 case CmpInst::Predicate::FCMP_UEQ:
3146 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3147 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3148 if (CondTy && !ST->hasAVX())
3149 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3150 CmpInst::Predicate::FCMP_UNO, CostKind) +
3151 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3152 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3153 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3154
3155 break;
3156 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3157 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3158 // Assume worst case scenario and add the maximum extra cost.
3159 ExtraCost = 3;
3160 break;
3161 default:
3162 break;
3163 }
3164 }
3165 }
3166
3167 static const CostKindTblEntry SLMCostTbl[] = {
3168 // slm pcmpeq/pcmpgt throughput is 2
3169 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3170 // slm pblendvb/blendvpd/blendvps throughput is 4
3171 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3172 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3173 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3174 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3175 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3176 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3177 };
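// [Editorial note - not part of the upstream file] In the CostKindTblEntry
// tables in this function the four values are indexed by cost kind, in the
// order { recip-throughput, latency, code-size, size-and-latency }; the
// lookups below pick one of them via Entry->Cost[CostKind]. For example the
// SLM v2i64 SETCC entry { 2, 5, 1, 2 } models a reciprocal throughput of 2
// and a latency of 5 cycles.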
3178
3179 static const CostKindTblEntry AVX512BWCostTbl[] = {
3180 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3181 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3182 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3183 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3184
3185 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3186 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3187 };
3188
3189 static const CostKindTblEntry AVX512CostTbl[] = {
3190 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3191 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3192 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3193 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3194
3195 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3196 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3197 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3198 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3199 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3200 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3201 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3202
3203 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3204 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3205 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3206 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3207 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3208 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3209 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3210 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3211 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3212 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3213 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3214 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3215 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3216 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3217
3218 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3219 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3220 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3221 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3222 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3223 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3224 };
3225
3226 static const CostKindTblEntry AVX2CostTbl[] = {
3227 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3228 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3229 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3230 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3231 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3232 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3233
3234 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3235 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3236 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3237 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3238
3239 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3240 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3241 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3242 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3243 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3244 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3245 };
3246
3247 static const CostKindTblEntry XOPCostTbl[] = {
3248 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3249 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3250 };
3251
3252 static const CostKindTblEntry AVX1CostTbl[] = {
3253 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3254 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3255 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3256 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3257 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3258 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3259
3260 // AVX1 does not support 8-wide integer compare.
3261 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3262 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3263 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3264 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3265
3266 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3267 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3268 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3269 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3270 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3271 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3272 };
3273
3274 static const CostKindTblEntry SSE42CostTbl[] = {
3275 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3276 };
3277
3278 static const CostKindTblEntry SSE41CostTbl[] = {
3279 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3280 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3281
3282 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3283 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3284 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3285 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3286 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3287 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3288 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3289 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3290 };
3291
3292 static const CostKindTblEntry SSE2CostTbl[] = {
3293 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3294 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3295
3296 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3297 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3298 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3299 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3300
3301 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3302 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3303 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3304 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3305 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3306 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3307 };
3308
3309 static const CostKindTblEntry SSE1CostTbl[] = {
3310 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3311 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3312
3313 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3314 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3315 };
3316
3317 if (ST->useSLMArithCosts())
3318 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3319 if (auto KindCost = Entry->Cost[CostKind])
3320 return LT.first * (ExtraCost + *KindCost);
3321
3322 if (ST->hasBWI())
3323 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3324 if (auto KindCost = Entry->Cost[CostKind])
3325 return LT.first * (ExtraCost + *KindCost);
3326
3327 if (ST->hasAVX512())
3328 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3329 if (auto KindCost = Entry->Cost[CostKind])
3330 return LT.first * (ExtraCost + *KindCost);
3331
3332 if (ST->hasAVX2())
3333 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3334 if (auto KindCost = Entry->Cost[CostKind])
3335 return LT.first * (ExtraCost + *KindCost);
3336
3337 if (ST->hasXOP())
3338 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3339 if (auto KindCost = Entry->Cost[CostKind])
3340 return LT.first * (ExtraCost + *KindCost);
3341
3342 if (ST->hasAVX())
3343 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3344 if (auto KindCost = Entry->Cost[CostKind])
3345 return LT.first * (ExtraCost + *KindCost);
3346
3347 if (ST->hasSSE42())
3348 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3349 if (auto KindCost = Entry->Cost[CostKind])
3350 return LT.first * (ExtraCost + *KindCost);
3351
3352 if (ST->hasSSE41())
3353 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3354 if (auto KindCost = Entry->Cost[CostKind])
3355 return LT.first * (ExtraCost + *KindCost);
3356
3357 if (ST->hasSSE2())
3358 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3359 if (auto KindCost = Entry->Cost[CostKind])
3360 return LT.first * (ExtraCost + *KindCost);
3361
3362 if (ST->hasSSE1())
3363 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3364 if (auto KindCost = Entry->Cost[CostKind])
3365 return LT.first * (ExtraCost + *KindCost);
3366
3367 // Assume a 3cy latency for fp select ops.
3368 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3369 if (ValTy->getScalarType()->isFloatingPointTy())
3370 return 3;
3371
3372 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3373}
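// [Editorial note - not part of the upstream file] Illustration of the
// ExtraCost logic above (a sketch, not exhaustive): on a plain SSE2 target an
// unsigned <4 x i32> icmp ugt has no native predicate, so it is modelled as
// the biased signed compare from the ICMP_UGT case,
//   cmpgt(xor(x,signbit), xor(y,signbit))
// i.e. the base v4i32 SETCC cost plus an ExtraCost of 2 (1 when comparing
// against a constant, since one xor can be folded).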
3374
3375 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
3376
3377 InstructionCost
3378 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3379 TTI::TargetCostKind CostKind) {
3380 // Costs should match the codegen from:
3381 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3382 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3383 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3384 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3385 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3386
3387 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3388 // specialized in these tables yet.
3389 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3390 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3391 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3392 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3393 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3394 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3395 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3396 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3397 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3398 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3399 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3400 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3401 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3402 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3403 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3404 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3405 };
3406 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3407 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3408 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3409 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3410 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3411 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3412 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3413 };
3414 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3415 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3416 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3417 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3418 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3419 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3420 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3421 };
3422 static const CostKindTblEntry AVX512CDCostTbl[] = {
3423 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3424 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3425 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3426 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3427 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3428 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3429 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3430 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3431 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3432 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3433 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3434 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3435
3436 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3437 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3438 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3439 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3440 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3441 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3442 };
3443 static const CostKindTblEntry AVX512BWCostTbl[] = {
3444 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3445 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3446 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3447 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3448 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3449 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3450 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3451 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3452 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3453 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3454 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3455 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3456 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3457 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3458 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3459 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3460 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3461 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3462 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3463 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3464 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3465 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3466 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3467 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3468 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3469 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3470 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3471 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3472 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3473 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3474 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3475 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3476 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3477 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3478 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3479 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3480 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3481 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3482 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3483 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3484 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3485 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3486 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3487 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3488 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3489 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3490 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3491 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3492 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3493 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3494 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3495 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3496 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3497 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3498 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3499 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3500 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3501 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3502 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3503 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3504 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3505 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3506 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3507 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3508 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3509 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3510 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3511 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3512 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3513 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3514 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3515 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3516 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3517 };
3518 static const CostKindTblEntry AVX512CostTbl[] = {
3519 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3520 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3521 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3522 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3523 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3524 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3525 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3526 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3527 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3528 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3529 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3530 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3531 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3532 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3533 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3534 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3535 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3536 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3537 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3538 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3539 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3540 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3541 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3542 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3543 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3544 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3545 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3546 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3547 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3548 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3549 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3550 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3551 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3552 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3553 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3554 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3555 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3556 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3557 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3558 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3559 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3560 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3561 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3562 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3563 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3564 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3565 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3566 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3567 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3568 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3569 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3570 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3571 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3572 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3573 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3574 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3575 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3576 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3577 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3578 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3579 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3580 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3581 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3582 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3583 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3584 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3585 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3586 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3587 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3588 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3589 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3590 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3591 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3592 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3593 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3594 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3595 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3596 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3597 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3598 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3599 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3600 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3601 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3602 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3603 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3604 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3605 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3606 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3607 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3608 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3609 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3610 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3611 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3612 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3613 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3614 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3615 };
3616 static const CostKindTblEntry XOPCostTbl[] = {
3617 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3618 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3619 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3620 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3621 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3622 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3623 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3624 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3625 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3626 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3627 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3628 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3629 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3630 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3631 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3632 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3633 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3634 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3635 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3636 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3637 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3638 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3639 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3640 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3641 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3642 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3643 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3644 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3645 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } }
3646 };
3647 static const CostKindTblEntry AVX2CostTbl[] = {
3648 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3649 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3650 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3651 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3652 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3653 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3654 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3655 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3656 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3657 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3658 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3659 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3660 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3661 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3662 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3663 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3664 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3665 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3666 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3667 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3668 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3669 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3670 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3671 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3672 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3673 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3674 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3675 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3676 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3677 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3678 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3679 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3680 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3681 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3682 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3683 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3684 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3685 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3686 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3687 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3688 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3689 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3690 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3691 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3692 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3693 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3694 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3695 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3696 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3697 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3698 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3699 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3700 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3701 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3702 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3703 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3704 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3705 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3706 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3707 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3708 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3709 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3710 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3711 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3712 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3713 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3714 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3715 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3716 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3717 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3718 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3719 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3720 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3721 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3722 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3723 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3724 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3725 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3726 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3727 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3728 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3729 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3730 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3731 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3732 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3733 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3734 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3735 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3736 };
3737 static const CostKindTblEntry AVX1CostTbl[] = {
3738 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3739 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3740 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3741 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3742 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3743 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3744 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3745 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3746 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3747 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3748 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3749 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3750 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3751 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3752 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3753 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3754 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3755 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3756 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3757 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3758 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3759 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3760 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3761 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3762 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3763 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3764 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3765 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3766 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3767 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3768 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3769 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3770 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3771 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3772 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3773 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3774 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3775 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3776 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3777 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3778 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3779 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3780 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3781 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3782 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3783 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3784 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3785 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3786 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3787 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3788 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3789 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3790 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3791 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3792 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3793 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3794 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3795 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3796 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3797 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3798 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3799 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3800 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3801 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3802 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3803 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3804 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3805 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3806 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3807 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3808 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3809 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3810 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3811 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3812 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3813 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3814 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3815 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3816 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3817 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3818 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3819 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3820 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3821 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3822 };
3823 static const CostKindTblEntry GLMCostTbl[] = {
3824 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3825 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3826 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3827 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3828 };
3829 static const CostKindTblEntry SLMCostTbl[] = {
3830 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3831 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3832 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3833 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3834 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3835 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3836 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3837 };
3838 static const CostKindTblEntry SSE42CostTbl[] = {
3839 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3840 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3841 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3842 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3843 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3844 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3845 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3846 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3847 };
3848 static const CostKindTblEntry SSE41CostTbl[] = {
3849 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3850 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3851 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3852 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3853 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3854 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3855 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3856 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3857 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3858 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3859 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3860 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3861 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3862 };
3863 static const CostKindTblEntry SSSE3CostTbl[] = {
3864 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3865 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3866 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3867 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3868 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3869 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3870 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3871 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3872 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
3873 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
3874 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
3875 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
3876 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
3877 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
3878 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
3879 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
3880 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
3881 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
3882 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
3883 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
3884 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
3885 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
3886 };
3887 static const CostKindTblEntry SSE2CostTbl[] = {
3888 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
3889 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
3890 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
3891 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
3892 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
3893 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
3894 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
3895 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
3896 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
3897 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
3898 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
3899 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
3900 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
3901 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
3902 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
3903 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
3904 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
3905 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
3906 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
3907 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
3908 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
3909 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
3910 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
3911 { ISD::SADDSAT, MVT::v8i16, { 1 } },
3912 { ISD::SADDSAT, MVT::v16i8, { 1 } },
3913 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3914 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
3915 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3916 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
3917 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3918 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
3919 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3920 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
3921 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
3922 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
3923 { ISD::UADDSAT, MVT::v8i16, { 1 } },
3924 { ISD::UADDSAT, MVT::v16i8, { 1 } },
3925 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3926 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
3927 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
3928 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3929 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3930 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
3931 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
3932 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3933 { ISD::USUBSAT, MVT::v8i16, { 1 } },
3934 { ISD::USUBSAT, MVT::v16i8, { 1 } },
3935 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
3936 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
3937 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3938 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3939 };
3940 static const CostKindTblEntry SSE1CostTbl[] = {
3941 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
3942 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
3943 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
3944 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
3945 };
3946 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
3947 { ISD::CTTZ, MVT::i64, { 1 } },
3948 };
3949 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3950 { ISD::CTTZ, MVT::i32, { 1 } },
3951 { ISD::CTTZ, MVT::i16, { 1 } },
3952 { ISD::CTTZ, MVT::i8, { 1 } },
3953 };
3954 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3955 { ISD::CTLZ, MVT::i64, { 1 } },
3956 };
3957 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3958 { ISD::CTLZ, MVT::i32, { 1 } },
3959 { ISD::CTLZ, MVT::i16, { 2 } },
3960 { ISD::CTLZ, MVT::i8, { 2 } },
3961 };
3962 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
3963 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
3964 };
3965 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
3966 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
3967 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
3968 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
3969 };
3970 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
3971 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
3972 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
3973 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
3974 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
3975 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
3976 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
3977 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR
3978 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
3979 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
3980 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
3981 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
3982 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
3983 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
3984 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
3985 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
3986 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
3987 { ISD::SADDO, MVT::i64, { 1 } },
3988 { ISD::UADDO, MVT::i64, { 1 } },
3989 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
3990 };
3991 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3992 { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
3993 { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
3994 { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
3995 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
3996 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
3997 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
3998 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
3999 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4000 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4001 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4002 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4003 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
4004 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
4005 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4006 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
4007 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
4008 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
4009 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
4010 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
4011 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
4012 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4013 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4014 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4015 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4016 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4017 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4018 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4019 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4020 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4021 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4022 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4023 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4024 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4025 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4026 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4027 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4028 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4029 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4030 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4031 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4032 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4033 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4034 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4035 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4036 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4037 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4038 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4039 { ISD::SADDO, MVT::i32, { 1 } },
4040 { ISD::SADDO, MVT::i16, { 1 } },
4041 { ISD::SADDO, MVT::i8, { 1 } },
4042 { ISD::UADDO, MVT::i32, { 1 } },
4043 { ISD::UADDO, MVT::i16, { 1 } },
4044 { ISD::UADDO, MVT::i8, { 1 } },
4045 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4046 { ISD::UMULO, MVT::i16, { 2 } },
4047 { ISD::UMULO, MVT::i8, { 2 } },
4048 };
4049
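  // The switch below maps each supported intrinsic to the nearest ISD/X86ISD
  // opcode; anything left as ISD::DELETED_NODE falls through to the generic
  // BaseT implementation. The cost tables above are then searched from the
  // most specific subtarget feature (e.g. AVX512VBMI2) down to the plain
  // 32/64-bit X86 tables, and the first entry that defines a cost for the
  // requested cost kind wins.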
4050 Type *RetTy = ICA.getReturnType();
4051 Type *OpTy = RetTy;
4052 Intrinsic::ID IID = ICA.getID();
4053 unsigned ISD = ISD::DELETED_NODE;
4054 switch (IID) {
4055 default:
4056 break;
4057 case Intrinsic::abs:
4058 ISD = ISD::ABS;
4059 break;
4060 case Intrinsic::bitreverse:
4061 ISD = ISD::BITREVERSE;
4062 break;
4063 case Intrinsic::bswap:
4064 ISD = ISD::BSWAP;
4065 break;
4066 case Intrinsic::ctlz:
4067 ISD = ISD::CTLZ;
4068 break;
4069 case Intrinsic::ctpop:
4070 ISD = ISD::CTPOP;
4071 break;
4072 case Intrinsic::cttz:
4073 ISD = ISD::CTTZ;
4074 break;
4075 case Intrinsic::fshl:
4076 ISD = ISD::FSHL;
4077 if (!ICA.isTypeBasedOnly()) {
4078 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4079 if (Args[0] == Args[1]) {
4080 ISD = ISD::ROTL;
4081 // Handle scalar constant rotation amounts.
4082 // TODO: Handle vector + funnel-shift cases.
4083 if (isa_and_nonnull<ConstantInt>(Args[2]))
4084 ISD = X86ISD::VROTLI;
4085 }
4086 }
4087 break;
4088 case Intrinsic::fshr:
4089 // FSHR has same costs so don't duplicate.
4090 ISD = ISD::FSHL;
4091 if (!ICA.isTypeBasedOnly()) {
4092 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4093 if (Args[0] == Args[1]) {
4094 // Handle scalar constant rotation amount.
4095 // TODO: Handle vector + funnel-shift cases.
4096 ISD = ISD::ROTR;
4097 if (isa_and_nonnull<ConstantInt>(Args[2]))
4098 ISD = X86ISD::VROTLI;
4099 }
4100 }
4101 break;
4102 case Intrinsic::maxnum:
4103 case Intrinsic::minnum:
4104 // FMINNUM has same costs so don't duplicate.
4105 ISD = ISD::FMAXNUM;
4106 break;
4107 case Intrinsic::sadd_sat:
4108 ISD = ISD::SADDSAT;
4109 break;
4110 case Intrinsic::smax:
4111 ISD = ISD::SMAX;
4112 break;
4113 case Intrinsic::smin:
4114 ISD = ISD::SMIN;
4115 break;
4116 case Intrinsic::ssub_sat:
4117 ISD = ISD::SSUBSAT;
4118 break;
4119 case Intrinsic::uadd_sat:
4120 ISD = ISD::UADDSAT;
4121 break;
4122 case Intrinsic::umax:
4123 ISD = ISD::UMAX;
4124 break;
4125 case Intrinsic::umin:
4126 ISD = ISD::UMIN;
4127 break;
4128 case Intrinsic::usub_sat:
4129 ISD = ISD::USUBSAT;
4130 break;
4131 case Intrinsic::sqrt:
4132 ISD = ISD::FSQRT;
4133 break;
4134 case Intrinsic::sadd_with_overflow:
4135 case Intrinsic::ssub_with_overflow:
4136 // SSUBO has same costs so don't duplicate.
4137 ISD = ISD::SADDO;
4138 OpTy = RetTy->getContainedType(0);
4139 break;
4140 case Intrinsic::uadd_with_overflow:
4141 case Intrinsic::usub_with_overflow:
4142 // USUBO has same costs so don't duplicate.
4143 ISD = ISD::UADDO;
4144 OpTy = RetTy->getContainedType(0);
4145 break;
4146 case Intrinsic::umul_with_overflow:
4147 case Intrinsic::smul_with_overflow:
4148 // SMULO has same costs so don't duplicate.
4149 ISD = ISD::UMULO;
4150 OpTy = RetTy->getContainedType(0);
4151 break;
4152 }
4153
4154 if (ISD != ISD::DELETED_NODE) {
4155 // Legalize the type.
4156 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4157 MVT MTy = LT.second;
4158
4159 // Attempt to lookup cost.
4160 if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
4161 MTy.isVector()) {
4162 // With PSHUFB the code is very similar for all types. If we have integer
4163 // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
4164 // we also need a PSHUFB.
4165 unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
4166
4167 // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
4168 // instructions. We also need an extract and an insert.
4169 if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
4170 (ST->hasBWI() && MTy.is512BitVector())))
4171 Cost = Cost * 2 + 2;
4172
4173 return LT.first * Cost;
4174 }
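    // For example, bitreversing a 128-bit vXi8 vector needs a single
    // GF2P8AFFINEQB (Cost = 1), while a 256-bit vector on a target without
    // AVX2 is split, giving Cost = 1 * 2 + 2 = 4, before scaling by the
    // legalization factor LT.first.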
4175
4176 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4177 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4178 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4179 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4180 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4181 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4182 if (Cst->isAllOnesValue())
4183          ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4184    }
4185
4186 // FSQRT is a single instruction.
4187 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4188 return LT.first;
4189
4190 auto adjustTableCost = [](int ISD, unsigned Cost,
4191 InstructionCost LegalizationCost,
4192 FastMathFlags FMF) {
4193 // If there are no NANs to deal with, then these are reduced to a
4194 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4195 // assume is used in the non-fast case.
4196 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4197 if (FMF.noNaNs())
4198 return LegalizationCost * 1;
4199 }
4200 return LegalizationCost * (int)Cost;
4201 };
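    // From here on, each feature-specific table is tried in turn and the
    // first matching (ISD, MVT) entry that defines a cost for the requested
    // TTI::TargetCostKind wins. For example, an llvm.smax.v8i32 call on an
    // AVX2 target hits the AVX2 entry { 1, 1, 1, 2 }, so a throughput query
    // costs 1 (times the legalization factor LT.first).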
4202
4203 if (ST->useGLMDivSqrtCosts())
4204 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4205 if (auto KindCost = Entry->Cost[CostKind])
4206 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4207 ICA.getFlags());
4208
4209 if (ST->useSLMArithCosts())
4210 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4211 if (auto KindCost = Entry->Cost[CostKind])
4212 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4213 ICA.getFlags());
4214
4215 if (ST->hasVBMI2())
4216 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4217 if (auto KindCost = Entry->Cost[CostKind])
4218 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4219 ICA.getFlags());
4220
4221 if (ST->hasBITALG())
4222 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4223 if (auto KindCost = Entry->Cost[CostKind])
4224 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4225 ICA.getFlags());
4226
4227 if (ST->hasVPOPCNTDQ())
4228 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4229 if (auto KindCost = Entry->Cost[CostKind])
4230 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4231 ICA.getFlags());
4232
4233 if (ST->hasCDI())
4234 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4235 if (auto KindCost = Entry->Cost[CostKind])
4236 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4237 ICA.getFlags());
4238
4239 if (ST->hasBWI())
4240 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4241 if (auto KindCost = Entry->Cost[CostKind])
4242 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4243 ICA.getFlags());
4244
4245 if (ST->hasAVX512())
4246 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4247 if (auto KindCost = Entry->Cost[CostKind])
4248 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4249 ICA.getFlags());
4250
4251 if (ST->hasXOP())
4252 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4253 if (auto KindCost = Entry->Cost[CostKind])
4254 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4255 ICA.getFlags());
4256
4257 if (ST->hasAVX2())
4258 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4259 if (auto KindCost = Entry->Cost[CostKind])
4260 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4261 ICA.getFlags());
4262
4263 if (ST->hasAVX())
4264 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4265 if (auto KindCost = Entry->Cost[CostKind])
4266 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4267 ICA.getFlags());
4268
4269 if (ST->hasSSE42())
4270 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4271 if (auto KindCost = Entry->Cost[CostKind])
4272 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4273 ICA.getFlags());
4274
4275 if (ST->hasSSE41())
4276 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4277 if (auto KindCost = Entry->Cost[CostKind])
4278 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4279 ICA.getFlags());
4280
4281 if (ST->hasSSSE3())
4282 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4283 if (auto KindCost = Entry->Cost[CostKind])
4284 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4285 ICA.getFlags());
4286
4287 if (ST->hasSSE2())
4288 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4289 if (auto KindCost = Entry->Cost[CostKind])
4290 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4291 ICA.getFlags());
4292
4293 if (ST->hasSSE1())
4294 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4295 if (auto KindCost = Entry->Cost[CostKind])
4296 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4297 ICA.getFlags());
4298
4299 if (ST->hasBMI()) {
4300 if (ST->is64Bit())
4301 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4302 if (auto KindCost = Entry->Cost[CostKind])
4303 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4304 ICA.getFlags());
4305
4306 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4307 if (auto KindCost = Entry->Cost[CostKind])
4308 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4309 ICA.getFlags());
4310 }
4311
4312 if (ST->hasLZCNT()) {
4313 if (ST->is64Bit())
4314 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4315 if (auto KindCost = Entry->Cost[CostKind])
4316 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4317 ICA.getFlags());
4318
4319 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4320 if (auto KindCost = Entry->Cost[CostKind])
4321 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4322 ICA.getFlags());
4323 }
4324
4325 if (ST->hasPOPCNT()) {
4326 if (ST->is64Bit())
4327 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4328 if (auto KindCost = Entry->Cost[CostKind])
4329 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4330 ICA.getFlags());
4331
4332 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4333 if (auto KindCost = Entry->Cost[CostKind])
4334 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4335 ICA.getFlags());
4336 }
4337
4338 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4339 if (const Instruction *II = ICA.getInst()) {
4340 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4341 return TTI::TCC_Free;
4342 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4343 if (LI->hasOneUse())
4344 return TTI::TCC_Free;
4345 }
4346 }
4347 }
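    // MOVBE performs the byte swap as part of the load or store itself, so a
    // bswap that directly feeds a single store, or whose operand is a
    // single-use load, is treated as free on targets where MOVBE is fast.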
4348
4349 if (ST->is64Bit())
4350 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4351 if (auto KindCost = Entry->Cost[CostKind])
4352 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4353 ICA.getFlags());
4354
4355 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4356 if (auto KindCost = Entry->Cost[CostKind])
4357 return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4358 }
4359
4360  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4361}
4362
4363InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4364                                               TTI::TargetCostKind CostKind,
4365                                               unsigned Index, Value *Op0,
4366 Value *Op1) {
4367 static const CostTblEntry SLMCostTbl[] = {
4368 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4369 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4370 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4371 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4372 };
4373
4374 assert(Val->isVectorTy() && "This must be a vector type");
4375 Type *ScalarType = Val->getScalarType();
4376 InstructionCost RegisterFileMoveCost = 0;
4377
4378 // Non-immediate extraction/insertion can be handled as a sequence of
4379 // aliased loads+stores via the stack.
4380 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4381 Opcode == Instruction::InsertElement)) {
4382 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4383 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4384
4385 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4386 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4387 Align VecAlign = DL.getPrefTypeAlign(Val);
4388 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4389
4390 // Extract - store vector to stack, load scalar.
4391 if (Opcode == Instruction::ExtractElement) {
4392 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4393 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4394 CostKind);
4395 }
4396 // Insert - store vector to stack, store scalar, load vector.
4397 if (Opcode == Instruction::InsertElement) {
4398 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4399 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4400 CostKind) +
4401 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4402 }
4403 }
4404
4405 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4406 Opcode == Instruction::InsertElement)) {
4407    // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4408 if (Opcode == Instruction::ExtractElement &&
4409 ScalarType->getScalarSizeInBits() == 1 &&
4410 cast<FixedVectorType>(Val)->getNumElements() > 1)
4411 return 1;
4412
4413 // Legalize the type.
4414 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4415
4416 // This type is legalized to a scalar type.
4417 if (!LT.second.isVector())
4418 return 0;
4419
4420 // The type may be split. Normalize the index to the new type.
4421 unsigned SizeInBits = LT.second.getSizeInBits();
4422 unsigned NumElts = LT.second.getVectorNumElements();
4423 unsigned SubNumElts = NumElts;
4424 Index = Index % NumElts;
4425
4426 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4427 // For inserts, we also need to insert the subvector back.
4428 if (SizeInBits > 128) {
4429 assert((SizeInBits % 128) == 0 && "Illegal vector");
4430 unsigned NumSubVecs = SizeInBits / 128;
4431 SubNumElts = NumElts / NumSubVecs;
4432 if (SubNumElts <= Index) {
4433 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4434 Index %= SubNumElts;
4435 }
4436 }
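    // For example, extracting element 5 of a v8i32 on an AVX target: the
    // 256-bit vector has two 128-bit subvectors of 4 elements each, so the
    // upper half must be extracted first (RegisterFileMoveCost += 1) and the
    // index is rebased to 1 within that subvector.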
4437
4438 MVT MScalarTy = LT.second.getScalarType();
4439 auto IsCheapPInsrPExtrInsertPS = [&]() {
4440 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4441 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4442 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4443 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4444 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4445 Opcode == Instruction::InsertElement);
4446 };
4447
4448 if (Index == 0) {
4449 // Floating point scalars are already located in index #0.
4450 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4451 // true for all.
4452 if (ScalarType->isFloatingPointTy() &&
4453 (Opcode != Instruction::InsertElement || !Op0 ||
4454 isa<UndefValue>(Op0)))
4455 return RegisterFileMoveCost;
4456
4457 if (Opcode == Instruction::InsertElement &&
4458 isa_and_nonnull<UndefValue>(Op0)) {
4459 // Consider the gather cost to be cheap.
4460 if (isa_and_nonnull<LoadInst>(Op1))
4461 return RegisterFileMoveCost;
4462 if (!IsCheapPInsrPExtrInsertPS()) {
4463 // mov constant-to-GPR + movd/movq GPR -> XMM.
4464 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4465 return 2 + RegisterFileMoveCost;
4466 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4467 return 1 + RegisterFileMoveCost;
4468 }
4469 }
4470
4471 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4472 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4473 return 1 + RegisterFileMoveCost;
4474 }
4475
4476 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4477 assert(ISD && "Unexpected vector opcode");
4478 if (ST->useSLMArithCosts())
4479 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4480 return Entry->Cost + RegisterFileMoveCost;
4481
4482 // Consider cheap cases.
4483 if (IsCheapPInsrPExtrInsertPS())
4484 return 1 + RegisterFileMoveCost;
4485
4486 // For extractions we just need to shuffle the element to index 0, which
4487 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4488    // the element to its destination. In both cases we must handle the
4489 // subvector move(s).
4490 // If the vector type is already less than 128-bits then don't reduce it.
4491 // TODO: Under what circumstances should we shuffle using the full width?
4492 InstructionCost ShuffleCost = 1;
4493 if (Opcode == Instruction::InsertElement) {
4494 auto *SubTy = cast<VectorType>(Val);
4495 EVT VT = TLI->getValueType(DL, Val);
4496 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4497 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4498 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4499 CostKind, 0, SubTy);
4500 }
4501 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4502 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4503 }
4504
4505 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4506 RegisterFileMoveCost;
4507}
4508
4509InstructionCost
4510X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4511                                     bool Insert, bool Extract,
4512                                     TTI::TargetCostKind CostKind) {
4513 assert(DemandedElts.getBitWidth() ==
4514 cast<FixedVectorType>(Ty)->getNumElements() &&
4515 "Vector size mismatch");
4516
4517 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4518 MVT MScalarTy = LT.second.getScalarType();
4519 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4520  InstructionCost Cost = 0;
4521
4522 constexpr unsigned LaneBitWidth = 128;
4523 assert((LegalVectorBitWidth < LaneBitWidth ||
4524 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4525 "Illegal vector");
4526
4527 const int NumLegalVectors = *LT.first.getValue();
4528 assert(NumLegalVectors >= 0 && "Negative cost!");
4529
4530  // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4531 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4532 if (Insert) {
4533 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4534 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4535 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4536 // For types we can insert directly, insertion into 128-bit sub vectors is
4537 // cheap, followed by a cheap chain of concatenations.
4538 if (LegalVectorBitWidth <= LaneBitWidth) {
4539 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4540 /*Extract*/ false, CostKind);
4541 } else {
4542 // In each 128-lane, if at least one index is demanded but not all
4543 // indices are demanded and this 128-lane is not the first 128-lane of
4544        // indices are demanded and this 128-lane is not the first 128-lane of
4545        // the legalized-vector, then this 128-lane needs an extracti128; if in
4546        // each 128-lane, there is at least one demanded index, this 128-lane
4547        // needs an inserti128.
4547
4548 // The following cases will help you build a better understanding:
4549 // Assume we insert several elements into a v8i32 vector in avx2,
4550        // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
4551        // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
4552 // inserti128.
4553 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
4554 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4555 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4556 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4557 unsigned NumLegalElts =
4558 LT.second.getVectorNumElements() * NumLegalVectors;
4559 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4560 "Vector has been legalized to smaller element count");
4561 assert((NumLegalElts % NumLanesTotal) == 0 &&
4562 "Unexpected elts per lane");
4563 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4564
4565 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4566 auto *LaneTy =
4567 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4568
4569 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4570 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4571 NumEltsPerLane, NumEltsPerLane * I);
4572 if (LaneEltMask.isZero())
4573 continue;
4574 // FIXME: we don't need to extract if all non-demanded elements
4575 // are legalization-inserted padding.
4576 if (!LaneEltMask.isAllOnes())
4577 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4578 CostKind, I * NumEltsPerLane, LaneTy);
4579 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4580 /*Extract*/ false, CostKind);
4581 }
4582
4583 APInt AffectedLanes =
4584 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4585 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4586 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4587 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4588 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4589 unsigned I = NumLegalLanes * LegalVec + Lane;
4590 // No need to insert unaffected lane; or lane 0 of each legal vector
4591 // iff ALL lanes of that vector were affected and will be inserted.
4592 if (!AffectedLanes[I] ||
4593 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4594 continue;
4595 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4596 CostKind, I * NumEltsPerLane, LaneTy);
4597 }
4598 }
4599 }
4600 } else if (LT.second.isVector()) {
4601 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4602 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4603 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4604 // considered cheap.
4605 if (Ty->isIntOrIntVectorTy())
4606 Cost += DemandedElts.popcount();
4607
4608 // Get the smaller of the legalized or original pow2-extended number of
4609 // vector elements, which represents the number of unpacks we'll end up
4610 // performing.
4611 unsigned NumElts = LT.second.getVectorNumElements();
4612 unsigned Pow2Elts =
4613 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4614 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4615 }
4616 }
4617
4618 if (Extract) {
4619 // vXi1 can be efficiently extracted with MOVMSK.
4620 // TODO: AVX512 predicate mask handling.
4621 // NOTE: This doesn't work well for roundtrip scalarization.
4622 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4623 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4624 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4625 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4626 return MOVMSKCost;
4627 }
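    // For example, extracting all the bits of a v32i1 costs a single mask
    // extraction (e.g. VPMOVMSKB) on AVX2, which covers 32 elements, but two
    // MOVMSK ops on SSE-only targets, where at most 16 elements are covered
    // per instruction.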
4628
4629 if (LT.second.isVector()) {
4630 unsigned NumLegalElts =
4631 LT.second.getVectorNumElements() * NumLegalVectors;
4632 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4633 "Vector has been legalized to smaller element count");
4634
4635 // If we're extracting elements from a 128-bit subvector lane,
4636 // we only need to extract each lane once, not for every element.
4637 if (LegalVectorBitWidth > LaneBitWidth) {
4638 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4639 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4640 assert((NumLegalElts % NumLanesTotal) == 0 &&
4641 "Unexpected elts per lane");
4642 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4643
4644 // Add cost for each demanded 128-bit subvector extraction.
4645 // Luckily this is a lot easier than for insertion.
4646 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4647 auto *LaneTy =
4648 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4649
4650 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4651 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4652 NumEltsPerLane, I * NumEltsPerLane);
4653 if (LaneEltMask.isZero())
4654 continue;
4655 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4656 CostKind, I * NumEltsPerLane, LaneTy);
4657          Cost += BaseT::getScalarizationOverhead(
4658              LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4659 }
4660
4661 return Cost;
4662 }
4663 }
4664
4665 // Fallback to default extraction.
4666 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4667 Extract, CostKind);
4668 }
4669
4670 return Cost;
4671}
4672
4673InstructionCost
4674X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4675 int VF, const APInt &DemandedDstElts,
4676                                      TTI::TargetCostKind CostKind) {
4677  const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4678 // We don't differentiate element types here, only element bit width.
4679 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4680
4681 auto bailout = [&]() {
4682 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4683 DemandedDstElts, CostKind);
4684 };
4685
4686 // For now, only deal with AVX512 cases.
4687 if (!ST->hasAVX512())
4688 return bailout();
4689
4690 // Do we have a native shuffle for this element type, or should we promote?
4691 unsigned PromEltTyBits = EltTyBits;
4692 switch (EltTyBits) {
4693 case 32:
4694 case 64:
4695 break; // AVX512F.
4696 case 16:
4697 if (!ST->hasBWI())
4698 PromEltTyBits = 32; // promote to i32, AVX512F.
4699 break; // AVX512BW
4700 case 8:
4701 if (!ST->hasVBMI())
4702 PromEltTyBits = 32; // promote to i32, AVX512F.
4703 break; // AVX512VBMI
4704 case 1:
4705 // There is no support for shuffling i1 elements. We *must* promote.
4706 if (ST->hasBWI()) {
4707 if (ST->hasVBMI())
4708 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4709 else
4710 PromEltTyBits = 16; // promote to i16, AVX512BW.
4711 break;
4712 }
4713 PromEltTyBits = 32; // promote to i32, AVX512F.
4714 break;
4715 default:
4716 return bailout();
4717 }
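  // If promotion is needed, the cost below is computed as an anyext of the
  // sources, plus the replication shuffle on the promoted element type, plus
  // a trunc back to the original element width (see the
  // PromEltTyBits != EltTyBits path further down).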
4718 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4719
4720 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4721 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4722
4723 int NumDstElements = VF * ReplicationFactor;
4724 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4725 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4726
4727 // Legalize the types.
4728 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4729 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4730 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4731 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4732 // They should have legalized into vector types.
4733 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4734 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4735 return bailout();
4736
4737 if (PromEltTyBits != EltTyBits) {
4738  // If we have to perform the shuffle with a wider elt type than our data type,
4739 // then we will first need to anyext (we don't care about the new bits)
4740 // the source elements, and then truncate Dst elements.
4741 InstructionCost PromotionCost;
4742 PromotionCost += getCastInstrCost(
4743 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4744      TTI::CastContextHint::None, CostKind);
4745  PromotionCost +=
4746 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4747 /*Src=*/PromDstVecTy,
4748                       TTI::CastContextHint::None, CostKind);
4749  return PromotionCost + getReplicationShuffleCost(PromEltTy,
4750 ReplicationFactor, VF,
4751 DemandedDstElts, CostKind);
4752 }
4753
4754 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4755 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4756 "We expect that the legalization doesn't affect the element width, "
4757 "doesn't coalesce/split elements.");
4758
4759 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4760 unsigned NumDstVectors =
4761 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4762
4763 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4764
4765 // Not all the produced Dst elements may be demanded. In our case,
4766 // given that a single Dst vector is formed by a single shuffle,
4767 // if all elements that will form a single Dst vector aren't demanded,
4768 // then we won't need to do that shuffle, so adjust the cost accordingly.
4769 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4770 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4771 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4772
4773 InstructionCost SingleShuffleCost = getShuffleCost(
4774 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4775 /*Index=*/0, /*SubTp=*/nullptr);
4776 return NumDstVectorsDemanded * SingleShuffleCost;
4777}
4778
4779InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4780                                            MaybeAlign Alignment,
4781 unsigned AddressSpace,
4782                                            TTI::TargetCostKind CostKind,
4783                                            TTI::OperandValueInfo OpInfo,
4784 const Instruction *I) {
4785 // TODO: Handle other cost kinds.
4786  if (CostKind != TTI::TCK_RecipThroughput) {
4787    if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4788      // A store instruction with index and scale costs 2 uops.
4789 // Check the preceding GEP to identify non-const indices.
4790 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4791 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4792 return TTI::TCC_Basic * 2;
4793 }
4794 }
4795 return TTI::TCC_Basic;
4796 }
4797
4798 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4799 "Invalid Opcode");
4800 // Type legalization can't handle structs
4801 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4802 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4803 CostKind);
4804
4805 // Legalize the type.
4806 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4807
4808 auto *VTy = dyn_cast<FixedVectorType>(Src);
4809
4810  InstructionCost Cost = 0;
4811
4812 // Add a cost for constant load to vector.
4813 if (Opcode == Instruction::Store && OpInfo.isConstant())
4814 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4815 /*AddressSpace=*/0, CostKind);
4816
4817 // Handle the simple case of non-vectors.
4818 // NOTE: this assumes that legalization never creates vector from scalars!
4819 if (!VTy || !LT.second.isVector()) {
4820 // Each load/store unit costs 1.
4821 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4822 }
4823
4824 bool IsLoad = Opcode == Instruction::Load;
4825
4826 Type *EltTy = VTy->getElementType();
4827
4828 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4829
4830 // Source of truth: how many elements were there in the original IR vector?
4831 const unsigned SrcNumElt = VTy->getNumElements();
4832
4833 // How far have we gotten?
4834 int NumEltRemaining = SrcNumElt;
4835 // Note that we intentionally capture by-reference, NumEltRemaining changes.
4836 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4837
4838 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4839
4840 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4841 const unsigned XMMBits = 128;
4842 if (XMMBits % EltTyBits != 0)
4843 // Vector size must be a multiple of the element size. I.e. no padding.
4844 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4845 CostKind);
4846 const int NumEltPerXMM = XMMBits / EltTyBits;
4847
4848 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4849
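  // The loop below walks from the widest legal operation down to single
  // elements, halving the operation size each time. Whole registers (and
  // 64-bit halves of an XMM) are loaded/stored directly; 32-bit and smaller
  // pieces additionally pay the insert/extract cost of moving data in and
  // out of the vector register, modeled via getScalarizationOverhead.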
4850 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4851 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4852 // How many elements would a single op deal with at once?
4853 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4854 // Vector size must be a multiple of the element size. I.e. no padding.
4855 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4856 CostKind);
4857 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4858
4859 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4860 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4861 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4862 "Unless we haven't halved the op size yet, "
4863 "we have less than two op's sized units of work left.");
4864
4865 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4866 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4867 : XMMVecTy;
4868
4869 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4870 "After halving sizes, the vector elt count is no longer a multiple "
4871 "of number of elements per operation?");
4872 auto *CoalescedVecTy =
4873 CurrNumEltPerOp == 1
4874 ? CurrVecTy
4875            : FixedVectorType::get(
4876                  IntegerType::get(Src->getContext(),
4877 EltTyBits * CurrNumEltPerOp),
4878 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4879 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4880 DL.getTypeSizeInBits(CurrVecTy) &&
4881           "coalescing elements doesn't change vector width.");
4882
4883 while (NumEltRemaining > 0) {
4884      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
4885
4886 // Can we use this vector size, as per the remaining element count?
4887 // Iff the vector is naturally aligned, we can do a wide load regardless.
4888 if (NumEltRemaining < CurrNumEltPerOp &&
4889 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4890 CurrOpSizeBytes != 1)
4891        break; // Try smaller vector size.
4892
4893 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4894
4895 // If we have fully processed the previous reg, we need to replenish it.
4896 if (SubVecEltsLeft == 0) {
4897 SubVecEltsLeft += CurrVecTy->getNumElements();
4898 // And that's free only for the 0'th subvector of a legalized vector.
4899 if (!Is0thSubVec)
4900          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4901                                        : TTI::ShuffleKind::SK_ExtractSubvector,
4902                                 VTy, std::nullopt, CostKind, NumEltDone(),
4903 CurrVecTy);
4904 }
4905
4906 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4907 // for smaller widths (32/16/8) we have to insert/extract them separately.
4908 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4909 // but let's pretend that it is also true for 16/8 bit wide ops...)
4910 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4911 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4912 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4913 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4914 APInt DemandedElts =
4915 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4916 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4917 assert(DemandedElts.popcount() == 1 && "Inserting single value");
4918 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4919 !IsLoad, CostKind);
4920 }
4921
4922 // This isn't exactly right. We're using slow unaligned 32-byte accesses
4923 // as a proxy for a double-pumped AVX memory interface such as on
4924 // Sandybridge.
4925 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
4926 // will be scalarized.
4927 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4928 Cost += 2;
4929 else if (CurrOpSizeBytes < 4)
4930 Cost += 2;
4931 else
4932 Cost += 1;
4933
4934 SubVecEltsLeft -= CurrNumEltPerOp;
4935 NumEltRemaining -= CurrNumEltPerOp;
4936 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4937 }
4938 }
4939
4940 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4941
4942 return Cost;
4943}
4944
4945InstructionCost
4946X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
4947 unsigned AddressSpace,
4948                                  TTI::TargetCostKind CostKind) {
4949  bool IsLoad = (Instruction::Load == Opcode);
4950 bool IsStore = (Instruction::Store == Opcode);
4951
4952 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
4953 if (!SrcVTy)
4954    // To calculate the scalar cost, take the regular cost without the mask.
4955 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
4956
4957 unsigned NumElem = SrcVTy->getNumElements();
4958 auto *MaskTy =
4959 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
4960 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
4961 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
4962 // Scalarization
4963 APInt DemandedElts = APInt::getAllOnes(NumElem);
4964    InstructionCost MaskSplitCost = getScalarizationOverhead(
4965        MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
4966 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
4967 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
4968        CmpInst::BAD_ICMP_PREDICATE, CostKind);
4969    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4970 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
4971    InstructionCost ValueSplitCost = getScalarizationOverhead(
4972        SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
4973 InstructionCost MemopCost =
4974 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4975 Alignment, AddressSpace, CostKind);
4976 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
4977 }
4978
4979 // Legalize the type.
4980 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
4981 auto VT = TLI->getValueType(DL, SrcVTy);
4982  InstructionCost Cost = 0;
4983  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
4984 LT.second.getVectorNumElements() == NumElem)
4985 // Promotion requires extend/truncate for data and a shuffle for mask.
4986 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
4987 CostKind, 0, nullptr) +
4988 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
4989 CostKind, 0, nullptr);
4990
4991 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
4992 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
4993 LT.second.getVectorNumElements());
4994    // Expanding requires filling the mask with zeroes.
4995 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
4996 CostKind, 0, MaskTy);
4997 }
4998
4999 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5000 if (!ST->hasAVX512())
5001 return Cost + LT.first * (IsLoad ? 2 : 8);
5002
5003 // AVX-512 masked load/store is cheaper
5004 return Cost + LT.first;
5005}
5006
5007InstructionCost
5008X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5009                                 const Value *Base,
5010 const TTI::PointersChainInfo &Info,
5011 Type *AccessTy, TTI::TargetCostKind CostKind) {
5012 if (Info.isSameBase() && Info.isKnownStride()) {
5013    // If all the pointers have a known stride, all the differences are
5014    // translated into constants. X86 memory addressing allows encoding them
5015    // into the displacement, so we just need to take the base GEP cost.
5016 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5017 SmallVector<const Value *> Indices(BaseGEP->indices());
5018 return getGEPCost(BaseGEP->getSourceElementType(),
5019 BaseGEP->getPointerOperand(), Indices, nullptr,
5020 CostKind);
5021 }
5022 return TTI::TCC_Free;
5023 }
5024 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5025}
5026
5027InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5028                                                      ScalarEvolution *SE,
5029 const SCEV *Ptr) {
5030 // Address computations in vectorized code with non-consecutive addresses will
5031 // likely result in more instructions compared to scalar code where the
5032 // computation can more often be merged into the index mode. The resulting
5033 // extra micro-ops can significantly decrease throughput.
5034 const unsigned NumVectorInstToHideOverhead = 10;
5035
5036 // Cost modeling of Strided Access Computation is hidden by the indexing
5037  // modes of X86 regardless of the stride value. We don't believe that there
5038  // is a difference between constant strided access in general and a constant
5039  // stride value which is less than or equal to 64.
5040 // Even in the case of (loop invariant) stride whose value is not known at
5041 // compile time, the address computation will not incur more than one extra
5042 // ADD instruction.
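  // Summarizing the cases handled below: a non-strided (gather-like) vector
  // address is charged NumVectorInstToHideOverhead, a strided access whose
  // step is not a compile-time constant is charged 1, and constant-strided
  // accesses fall through to the base implementation.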
5043 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5044 // TODO: AVX2 is the current cut-off because we don't have correct
5045 // interleaving costs for prior ISA's.
5046    if (!BaseT::isStridedAccess(Ptr))
5047      return NumVectorInstToHideOverhead;
5048    if (!BaseT::getConstantStrideStep(SE, Ptr))
5049      return 1;
5050 }
5051
5052 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5053}
5054
5055InstructionCost
5056X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5057                                       std::optional<FastMathFlags> FMF,
5058                                       TTI::TargetCostKind CostKind) {
5059  if (TTI::requiresOrderedReduction(FMF))
5060    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5061
5062  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5063  // and use it as the cost.
5064
5065 static const CostTblEntry SLMCostTbl[] = {
5066 { ISD::FADD, MVT::v2f64, 3 },
5067 { ISD::ADD, MVT::v2i64, 5 },
5068 };
5069
5070 static const CostTblEntry SSE2CostTbl[] = {
5071 { ISD::FADD, MVT::v2f64, 2 },
5072 { ISD::FADD, MVT::v2f32, 2 },
5073 { ISD::FADD, MVT::v4f32, 4 },
5074 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5075 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5076 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5077 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5078 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5079 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5080 { ISD::ADD, MVT::v2i8, 2 },
5081 { ISD::ADD, MVT::v4i8, 2 },
5082 { ISD::ADD, MVT::v8i8, 2 },
5083 { ISD::ADD, MVT::v16i8, 3 },
5084 };
5085
5086 static const CostTblEntry AVX1CostTbl[] = {
5087 { ISD::FADD, MVT::v4f64, 3 },
5088 { ISD::FADD, MVT::v4f32, 3 },
5089 { ISD::FADD, MVT::v8f32, 4 },
5090 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5091 { ISD::ADD, MVT::v4i64, 3 },
5092 { ISD::ADD, MVT::v8i32, 5 },
5093 { ISD::ADD, MVT::v16i16, 5 },
5094 { ISD::ADD, MVT::v32i8, 4 },
5095 };
5096
5097 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5098 assert(ISD && "Invalid opcode");
5099
5100 // Before legalizing the type, give a chance to look up illegal narrow types
5101 // in the table.
5102 // FIXME: Is there a better way to do this?
5103 EVT VT = TLI->getValueType(DL, ValTy);
5104 if (VT.isSimple()) {
5105 MVT MTy = VT.getSimpleVT();
5106 if (ST->useSLMArithCosts())
5107 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5108 return Entry->Cost;
5109
5110 if (ST->hasAVX())
5111 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5112 return Entry->Cost;
5113
5114 if (ST->hasSSE2())
5115 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5116 return Entry->Cost;
5117 }
5118
5119 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5120
5121 MVT MTy = LT.second;
5122
5123 auto *ValVTy = cast<FixedVectorType>(ValTy);
5124
5125 // Special case: vXi8 mul reductions are performed as vXi16.
5126 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5127 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5128 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5129 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5130                            TargetTransformInfo::CastContextHint::None,
5131                            CostKind) +
5132 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5133 }
5134
5135 InstructionCost ArithmeticCost = 0;
5136 if (LT.first != 1 && MTy.isVector() &&
5137 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5138 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5139 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5140 MTy.getVectorNumElements());
5141 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5142 ArithmeticCost *= LT.first - 1;
5143 }
5144
5145 if (ST->useSLMArithCosts())
5146 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5147 return ArithmeticCost + Entry->Cost;
5148
5149 if (ST->hasAVX())
5150 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5151 return ArithmeticCost + Entry->Cost;
5152
5153 if (ST->hasSSE2())
5154 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5155 return ArithmeticCost + Entry->Cost;
5156
5157 // FIXME: These assume a naive kshift+binop lowering, which is probably
5158 // conservative in most cases.
5159 static const CostTblEntry AVX512BoolReduction[] = {
5160 { ISD::AND, MVT::v2i1, 3 },
5161 { ISD::AND, MVT::v4i1, 5 },
5162 { ISD::AND, MVT::v8i1, 7 },
5163 { ISD::AND, MVT::v16i1, 9 },
5164 { ISD::AND, MVT::v32i1, 11 },
5165 { ISD::AND, MVT::v64i1, 13 },
5166 { ISD::OR, MVT::v2i1, 3 },
5167 { ISD::OR, MVT::v4i1, 5 },
5168 { ISD::OR, MVT::v8i1, 7 },
5169 { ISD::OR, MVT::v16i1, 9 },
5170 { ISD::OR, MVT::v32i1, 11 },
5171 { ISD::OR, MVT::v64i1, 13 },
5172 };
5173
5174 static const CostTblEntry AVX2BoolReduction[] = {
5175 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5176 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5177 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5178 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5179 };
5180
5181 static const CostTblEntry AVX1BoolReduction[] = {
5182 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5183 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5184 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5185 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5186 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5187 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5188 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5189 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5190 };
5191
5192 static const CostTblEntry SSE2BoolReduction[] = {
5193 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5194 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5195 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5196 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5197 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5198 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5199 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5200 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5201 };
5202
5203 // Handle bool allof/anyof patterns.
5204 if (ValVTy->getElementType()->isIntegerTy(1)) {
5205 InstructionCost ArithmeticCost = 0;
5206 if (LT.first != 1 && MTy.isVector() &&
5207 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5208 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5209 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5210 MTy.getVectorNumElements());
5211 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5212 ArithmeticCost *= LT.first - 1;
5213 }
5214
5215 if (ST->hasAVX512())
5216 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5217 return ArithmeticCost + Entry->Cost;
5218 if (ST->hasAVX2())
5219 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5220 return ArithmeticCost + Entry->Cost;
5221 if (ST->hasAVX())
5222 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5223 return ArithmeticCost + Entry->Cost;
5224 if (ST->hasSSE2())
5225 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5226 return ArithmeticCost + Entry->Cost;
5227
5228 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5229 }
5230
5231 unsigned NumVecElts = ValVTy->getNumElements();
5232 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5233
5234 // Special case power of 2 reductions where the scalar type isn't changed
5235 // by type legalization.
5236 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5237 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5238
5239 InstructionCost ReductionCost = 0;
5240
5241 auto *Ty = ValVTy;
5242 if (LT.first != 1 && MTy.isVector() &&
5243 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5244 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5245 Ty = FixedVectorType::get(ValVTy->getElementType(),
5246 MTy.getVectorNumElements());
5247 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5248 ReductionCost *= LT.first - 1;
5249 NumVecElts = MTy.getVectorNumElements();
5250 }
5251
5252 // Now handle reduction with the legal type, taking into account size changes
5253 // at each level.
5254 while (NumVecElts > 1) {
5255 // Determine the size of the remaining vector we need to reduce.
5256 unsigned Size = NumVecElts * ScalarSize;
5257 NumVecElts /= 2;
5258 // If we're reducing from 256/512 bits, use an extract_subvector.
5259 if (Size > 128) {
5260 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5261 ReductionCost +=
5262          getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5263                         NumVecElts, SubTy);
5264 Ty = SubTy;
5265 } else if (Size == 128) {
5266 // Reducing from 128 bits is a permute of v2f64/v2i64.
5267 FixedVectorType *ShufTy;
5268 if (ValVTy->isFloatingPointTy())
5269 ShufTy =
5270 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5271 else
5272 ShufTy =
5273 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5274 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5275 std::nullopt, CostKind, 0, nullptr);
5276 } else if (Size == 64) {
5277 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5278 FixedVectorType *ShufTy;
5279 if (ValVTy->isFloatingPointTy())
5280 ShufTy =
5281 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5282 else
5283 ShufTy =
5284 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5285 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5286 std::nullopt, CostKind, 0, nullptr);
5287 } else {
5288 // Reducing from smaller size is a shift by immediate.
5289 auto *ShiftTy = FixedVectorType::get(
5290 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5291 ReductionCost += getArithmeticInstrCost(
5292 Instruction::LShr, ShiftTy, CostKind,
5293          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5294          {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5295    }
5296
5297 // Add the arithmetic op for this level.
5298 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5299 }
5300
5301 // Add the final extract element to the cost.
5302 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5303 CostKind, 0, nullptr, nullptr);
5304}
5305
5306InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5307                                          TTI::TargetCostKind CostKind,
5308                                          FastMathFlags FMF) {
5309 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5310 return getIntrinsicInstrCost(ICA, CostKind);
5311}
5312
5313InstructionCost
5314X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5315                                   FastMathFlags FMF,
5316                                   TTI::TargetCostKind CostKind) {
5317  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5318
5319 MVT MTy = LT.second;
5320
5321 int ISD;
5322 if (ValTy->isIntOrIntVectorTy()) {
5323 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5324 : ISD::SMIN;
5325 } else {
5326 assert(ValTy->isFPOrFPVectorTy() &&
5327           "Expected floating point or integer vector type.");
5328 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5329 ? ISD::FMINNUM
5330 : ISD::FMINIMUM;
5331 }
5332
5333  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5334  // and use it as the cost.
5335
5336 static const CostTblEntry SSE2CostTbl[] = {
5337 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5338 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5339 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5340 };
5341
5342 static const CostTblEntry SSE41CostTbl[] = {
5343 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5344 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5345 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5346 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5347 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5348 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5349 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5350 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5351 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5352 {ISD::SMIN, MVT::v16i8, 6},
5353 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5354 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5355 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5356 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5357 };
5358
5359 static const CostTblEntry AVX1CostTbl[] = {
5360 {ISD::SMIN, MVT::v16i16, 6},
5361 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5362 {ISD::SMIN, MVT::v32i8, 8},
5363 {ISD::UMIN, MVT::v32i8, 8},
5364 };
5365
5366 static const CostTblEntry AVX512BWCostTbl[] = {
5367 {ISD::SMIN, MVT::v32i16, 8},
5368 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5369 {ISD::SMIN, MVT::v64i8, 10},
5370 {ISD::UMIN, MVT::v64i8, 10},
5371 };
5372
5373 // Before legalizing the type, give a chance to look up illegal narrow types
5374 // in the table.
5375 // FIXME: Is there a better way to do this?
5376 EVT VT = TLI->getValueType(DL, ValTy);
5377 if (VT.isSimple()) {
5378 MVT MTy = VT.getSimpleVT();
5379 if (ST->hasBWI())
5380 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5381 return Entry->Cost;
5382
5383 if (ST->hasAVX())
5384 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5385 return Entry->Cost;
5386
5387 if (ST->hasSSE41())
5388 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5389 return Entry->Cost;
5390
5391 if (ST->hasSSE2())
5392 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5393 return Entry->Cost;
5394 }
5395
5396 auto *ValVTy = cast<FixedVectorType>(ValTy);
5397 unsigned NumVecElts = ValVTy->getNumElements();
5398
5399 auto *Ty = ValVTy;
5400 InstructionCost MinMaxCost = 0;
5401 if (LT.first != 1 && MTy.isVector() &&
5402 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5403    // Type needs to be split. We need LT.first - 1 operations.
5404 Ty = FixedVectorType::get(ValVTy->getElementType(),
5405 MTy.getVectorNumElements());
5406 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5407 MinMaxCost *= LT.first - 1;
5408 NumVecElts = MTy.getVectorNumElements();
5409 }
5410
5411 if (ST->hasBWI())
5412 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5413 return MinMaxCost + Entry->Cost;
5414
5415 if (ST->hasAVX())
5416 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5417 return MinMaxCost + Entry->Cost;
5418
5419 if (ST->hasSSE41())
5420 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5421 return MinMaxCost + Entry->Cost;
5422
5423 if (ST->hasSSE2())
5424 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5425 return MinMaxCost + Entry->Cost;
5426
5427 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5428
5429 // Special case power of 2 reductions where the scalar type isn't changed
5430 // by type legalization.
5431 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5432 ScalarSize != MTy.getScalarSizeInBits())
5433 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5434
5435 // Now handle reduction with the legal type, taking into account size changes
5436 // at each level.
5437 while (NumVecElts > 1) {
5438 // Determine the size of the remaining vector we need to reduce.
5439 unsigned Size = NumVecElts * ScalarSize;
5440 NumVecElts /= 2;
5441 // If we're reducing from 256/512 bits, use an extract_subvector.
5442 if (Size > 128) {
5443 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5444 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5445 CostKind, NumVecElts, SubTy);
5446 Ty = SubTy;
5447 } else if (Size == 128) {
5448 // Reducing from 128 bits is a permute of v2f64/v2i64.
5449 VectorType *ShufTy;
5450 if (ValTy->isFloatingPointTy())
5451 ShufTy =
5452            FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5453      else
5454 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5455 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5456 std::nullopt, CostKind, 0, nullptr);
5457 } else if (Size == 64) {
5458 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5459 FixedVectorType *ShufTy;
5460 if (ValTy->isFloatingPointTy())
5461 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5462 else
5463 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5464 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5465 std::nullopt, CostKind, 0, nullptr);
5466 } else {
5467 // Reducing from smaller size is a shift by immediate.
5468 auto *ShiftTy = FixedVectorType::get(
5469 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5470 MinMaxCost += getArithmeticInstrCost(
5471 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5472          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5473          {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5474    }
5475
5476 // Add the arithmetic op for this level.
5477 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5478 }
5479
5480 // Add the final extract element to the cost.
5481 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5482 CostKind, 0, nullptr, nullptr);
5483}
5484
5485/// Calculate the cost of materializing a 64-bit value. This helper
5486/// method might only calculate a fraction of a larger immediate. Therefore it
5487/// is valid to return a cost of ZERO.
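/// For example, in this model a value of 0 is free, a value that fits in a
/// sign-extended 32-bit immediate costs one TCC_Basic, and any wider value
/// costs two (roughly a movabsq-style materialization).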
5488InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5489  if (Val == 0)
5490 return TTI::TCC_Free;
5491
5492 if (isInt<32>(Val))
5493 return TTI::TCC_Basic;
5494
5495 return 2 * TTI::TCC_Basic;
5496}
5497
5498InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5499                                          TTI::TargetCostKind CostKind) {
5500  assert(Ty->isIntegerTy());
5501
5502 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5503 if (BitSize == 0)
5504 return ~0U;
5505
5506 // Never hoist constants larger than 128bit, because this might lead to
5507 // incorrect code generation or assertions in codegen.
5508 // Fixme: Create a cost model for types larger than i128 once the codegen
5509 // issues have been fixed.
5510 if (BitSize > 128)
5511 return TTI::TCC_Free;
5512
5513 if (Imm == 0)
5514 return TTI::TCC_Free;
5515
5516 // Sign-extend all constants to a multiple of 64-bit.
5517 APInt ImmVal = Imm;
5518 if (BitSize % 64 != 0)
5519 ImmVal = Imm.sext(alignTo(BitSize, 64));
5520
5521 // Split the constant into 64-bit chunks and calculate the cost for each
5522 // chunk.
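  // For example (in this model): a 128-bit immediate whose low 64-bit chunk is
  // zero and whose high chunk fits in a signed 32-bit value costs
  // max(1, 0 + 1) = 1.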
5523  InstructionCost Cost = 0;
5524  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5525 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5526 int64_t Val = Tmp.getSExtValue();
5527 Cost += getIntImmCost(Val);
5528 }
5529 // We need at least one instruction to materialize the constant.
5530 return std::max<InstructionCost>(1, Cost);
5531}
5532
5533InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5534                                              const APInt &Imm, Type *Ty,
5535                                              TTI::TargetCostKind CostKind,
5536                                              Instruction *Inst) {
5537 assert(Ty->isIntegerTy());
5538
5539 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5540 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5541 // here, so that constant hoisting will ignore this constant.
5542 if (BitSize == 0)
5543 return TTI::TCC_Free;
5544
5545 unsigned ImmIdx = ~0U;
5546 switch (Opcode) {
5547 default:
5548 return TTI::TCC_Free;
5549 case Instruction::GetElementPtr:
5550 // Always hoist the base address of a GetElementPtr. This prevents the
5551 // creation of new constants for every base constant that gets constant
5552 // folded with the offset.
5553 if (Idx == 0)
5554 return 2 * TTI::TCC_Basic;
5555 return TTI::TCC_Free;
5556 case Instruction::Store:
5557 ImmIdx = 0;
5558 break;
5559 case Instruction::ICmp:
5560 // This is an imperfect hack to prevent constant hoisting of
5561 // compares that might be trying to check if a 64-bit value fits in
5562 // 32-bits. The backend can optimize these cases using a right shift by 32.
5563 // Ideally we would check the compare predicate here. There also other
5564 // similar immediates the backend can use shifts for.
5565 if (Idx == 1 && Imm.getBitWidth() == 64) {
5566 uint64_t ImmVal = Imm.getZExtValue();
5567 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5568 return TTI::TCC_Free;
5569 }
5570 ImmIdx = 1;
5571 break;
5572 case Instruction::And:
5573 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5574 // by using a 32-bit operation with implicit zero extension. Detect such
5575 // immediates here as the normal path expects bit 31 to be sign extended.
5576 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5577 return TTI::TCC_Free;
5578 ImmIdx = 1;
5579 break;
5580 case Instruction::Add:
5581 case Instruction::Sub:
5582 // For add/sub, we can use the opposite instruction for INT32_MIN.
5583 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5584 return TTI::TCC_Free;
5585 ImmIdx = 1;
5586 break;
5587 case Instruction::UDiv:
5588 case Instruction::SDiv:
5589 case Instruction::URem:
5590 case Instruction::SRem:
5591 // Division by constant is typically expanded later into a different
5592 // instruction sequence. This completely changes the constants.
5593 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5594 return TTI::TCC_Free;
5595 case Instruction::Mul:
5596 case Instruction::Or:
5597 case Instruction::Xor:
5598 ImmIdx = 1;
5599 break;
5600 // Always return TCC_Free for the shift value of a shift instruction.
5601 case Instruction::Shl:
5602 case Instruction::LShr:
5603 case Instruction::AShr:
5604 if (Idx == 1)
5605 return TTI::TCC_Free;
5606 break;
5607 case Instruction::Trunc:
5608 case Instruction::ZExt:
5609 case Instruction::SExt:
5610 case Instruction::IntToPtr:
5611 case Instruction::PtrToInt:
5612 case Instruction::BitCast:
5613 case Instruction::PHI:
5614 case Instruction::Call:
5615 case Instruction::Select:
5616 case Instruction::Ret:
5617 case Instruction::Load:
5618 break;
5619 }
5620
5621 if (Idx == ImmIdx) {
5622 uint64_t NumConstants = divideCeil(BitSize, 64);
5623    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5624    return (Cost <= NumConstants * TTI::TCC_Basic)
5625 ? static_cast<int>(TTI::TCC_Free)
5626 : Cost;
5627 }
5628
5629 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5630}
5631
5632InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5633                                                const APInt &Imm, Type *Ty,
5634                                                TTI::TargetCostKind CostKind) {
5635  assert(Ty->isIntegerTy());
5636
5637 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5638 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5639 // here, so that constant hoisting will ignore this constant.
5640 if (BitSize == 0)
5641 return TTI::TCC_Free;
5642
5643 switch (IID) {
5644 default:
5645 return TTI::TCC_Free;
5646 case Intrinsic::sadd_with_overflow:
5647 case Intrinsic::uadd_with_overflow:
5648 case Intrinsic::ssub_with_overflow:
5649 case Intrinsic::usub_with_overflow:
5650 case Intrinsic::smul_with_overflow:
5651 case Intrinsic::umul_with_overflow:
5652 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5653 return TTI::TCC_Free;
5654 break;
5655 case Intrinsic::experimental_stackmap:
5656 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5657 return TTI::TCC_Free;
5658 break;
5659 case Intrinsic::experimental_patchpoint_void:
5660 case Intrinsic::experimental_patchpoint:
5661 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5662 return TTI::TCC_Free;
5663 break;
5664 }
5665 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5666}
5667
5668InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5669                                           TTI::TargetCostKind CostKind,
5670                                           const Instruction *I) {
5671  if (CostKind != TTI::TCK_RecipThroughput)
5672    return Opcode == Instruction::PHI ? 0 : 1;
5673 // Branches are assumed to be predicted.
5674 return 0;
5675}
5676
5677int X86TTIImpl::getGatherOverhead() const {
5678 // Some CPUs have more overhead for gather. The specified overhead is relative
5679 // to the Load operation. "2" is the number provided by Intel architects. This
5680 // parameter is used for cost estimation of Gather Op and comparison with
5681 // other alternatives.
5682  // TODO: Remove the explicit hasAVX512()? That would mean we would only
5683 // enable gather with a -march.
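  // As a rough illustration: getGSVectorCost below models a VF-wide gather as
  // GatherOverhead + VF * (scalar load cost), so the 1024 returned on other
  // subtargets effectively disables gathers in the cost model.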
5684 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5685 return 2;
5686
5687 return 1024;
5688}
5689
5690int X86TTIImpl::getScatterOverhead() const {
5691 if (ST->hasAVX512())
5692 return 2;
5693
5694 return 1024;
5695}
5696
5697// Return an average cost of Gather / Scatter instruction, maybe improved later.
5698// FIXME: Add TargetCostKind support.
5699InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
5700                                            TTI::TargetCostKind CostKind,
5701                                            Type *SrcVTy, const Value *Ptr,
5702 Align Alignment,
5703 unsigned AddressSpace) {
5704
5705 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5706 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5707
5708 // Try to reduce index size from 64 bit (default for GEP)
5709 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5710  // operation will use 16 x 64 indices which do not fit in a zmm and need
5711  // to be split. Also check that the base pointer is the same for all lanes,
5712 // and that there's at most one variable index.
5713 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5714 unsigned IndexSize = DL.getPointerSizeInBits();
5715 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5716 if (IndexSize < 64 || !GEP)
5717 return IndexSize;
5718
5719 unsigned NumOfVarIndices = 0;
5720 const Value *Ptrs = GEP->getPointerOperand();
5721 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5722 return IndexSize;
5723 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5724 if (isa<Constant>(GEP->getOperand(I)))
5725 continue;
5726 Type *IndxTy = GEP->getOperand(I)->getType();
5727 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5728 IndxTy = IndexVTy->getElementType();
5729 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5730 !isa<SExtInst>(GEP->getOperand(I))) ||
5731 ++NumOfVarIndices > 1)
5732 return IndexSize; // 64
5733 }
5734 return (unsigned)32;
5735 };
5736
5737 // Trying to reduce IndexSize to 32 bits for vector 16.
5738 // By default the IndexSize is equal to pointer size.
5739 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5740 ? getIndexSizeInBits(Ptr, DL)
5741                           : DL.getPointerSizeInBits();
5742
5743 auto *IndexVTy = FixedVectorType::get(
5744 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5745 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5746 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5747 InstructionCost::CostType SplitFactor =
5748 *std::max(IdxsLT.first, SrcLT.first).getValue();
5749 if (SplitFactor > 1) {
5750 // Handle splitting of vector of pointers
5751 auto *SplitSrcTy =
5752 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5753 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
5754 Alignment, AddressSpace);
5755 }
5756
5757 // The gather / scatter cost is given by Intel architects. It is a rough
5758  // number since we are looking at one instruction at a time.
5759 const int GSOverhead = (Opcode == Instruction::Load)
5760 ? getGatherOverhead()
5761 : getScatterOverhead();
5762 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5763 MaybeAlign(Alignment), AddressSpace,
5764                                           CostKind);
5765}
5766
5767/// Return the cost of full scalarization of gather / scatter operation.
5768///
5769/// Opcode - Load or Store instruction.
5770/// SrcVTy - The type of the data vector that should be gathered or scattered.
5771/// VariableMask - The mask is non-constant at compile time.
5772/// Alignment - Alignment for one element.
5773/// AddressSpace - pointer[s] address space.
5774/// TODO: Remove this and use getCommonMaskedMemoryOpCost directly.
5775InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode,
5776                                            TTI::TargetCostKind CostKind,
5777                                            Type *SrcVTy, bool VariableMask,
5778 Align Alignment,
5779 unsigned AddressSpace) {
5780 Type *ScalarTy = SrcVTy->getScalarType();
5781 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5782 APInt DemandedElts = APInt::getAllOnes(VF);
5783
5784 InstructionCost MaskUnpackCost = 0;
5785 if (VariableMask) {
5786 auto *MaskTy =
5787        FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5788    MaskUnpackCost = getScalarizationOverhead(
5789 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5790 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5791 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5792        CmpInst::BAD_ICMP_PREDICATE, CostKind);
5793    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5794 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5795 }
5796
5797 InstructionCost AddressUnpackCost = getScalarizationOverhead(
5798      FixedVectorType::get(PointerType::getUnqual(ScalarTy->getContext()), VF),
5799      DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5800
5801 // The cost of the scalar loads/stores.
5802 InstructionCost MemoryOpCost =
5803 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5804                           AddressSpace, CostKind);
5805
5806 // The cost of forming the vector from loaded scalars/
5807 // scalarizing the vector to perform scalar stores.
5808 InstructionCost InsertExtractCost = getScalarizationOverhead(
5809 cast<FixedVectorType>(SrcVTy), DemandedElts,
5810 /*Insert=*/Opcode == Instruction::Load,
5811 /*Extract=*/Opcode == Instruction::Store, CostKind);
5812
5813 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5814}
5815
5816/// Calculate the cost of Gather / Scatter operation
5817InstructionCost X86TTIImpl::getGatherScatterOpCost(
5818    unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5819    Align Alignment, TTI::TargetCostKind CostKind,
5820    const Instruction *I = nullptr) {
5821  if (CostKind != TTI::TCK_RecipThroughput) {
5822    if ((Opcode == Instruction::Load &&
5823 isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5824 !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5825 Align(Alignment))) ||
5826 (Opcode == Instruction::Store &&
5827 isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5828 !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5829 Align(Alignment))))
5830 return 1;
5831 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5832 Alignment, CostKind, I);
5833 }
5834
5835 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5836 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5837 if (!PtrTy && Ptr->getType()->isVectorTy())
5838 PtrTy = dyn_cast<PointerType>(
5839 cast<VectorType>(Ptr->getType())->getElementType());
5840 assert(PtrTy && "Unexpected type for Ptr argument");
5841 unsigned AddressSpace = PtrTy->getAddressSpace();
5842
5843 if ((Opcode == Instruction::Load &&
5844 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5845 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5846 Align(Alignment)))) ||
5847 (Opcode == Instruction::Store &&
5848 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5849 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5850 Align(Alignment)))))
5851 return getGSScalarCost(Opcode, CostKind, SrcVTy, VariableMask, Alignment,
5852 AddressSpace);
5853
5854 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
5855 AddressSpace);
5856}
5857
5858bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5859                               const TargetTransformInfo::LSRCost &C2) {
5860 // X86 specific here are "instruction number 1st priority".
5861 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5862 C1.NumIVMuls, C1.NumBaseAdds,
5863 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5864 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5865 C2.NumIVMuls, C2.NumBaseAdds,
5866 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5867}
5868
5869bool X86TTIImpl::canMacroFuseCmp() {
5870  return ST->hasMacroFusion() || ST->hasBranchFusion();
5871}
5872
5873bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5874 if (!ST->hasAVX())
5875 return false;
5876
5877 // The backend can't handle a single element vector.
5878 if (isa<VectorType>(DataTy) &&
5879 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5880 return false;
5881 Type *ScalarTy = DataTy->getScalarType();
5882
5883 if (ScalarTy->isPointerTy())
5884 return true;
5885
5886 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5887 return true;
5888
5889 if (ScalarTy->isHalfTy() && ST->hasBWI())
5890 return true;
5891
5892 if (ScalarTy->isBFloatTy() && ST->hasBF16())
5893 return true;
5894
5895 if (!ScalarTy->isIntegerTy())
5896 return false;
5897
5898 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5899 return IntWidth == 32 || IntWidth == 64 ||
5900 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5901}
5902
5903bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5904 return isLegalMaskedLoad(DataType, Alignment);
5905}
5906
5907bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5908 unsigned DataSize = DL.getTypeStoreSize(DataType);
5909 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5910 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5911 // (the equivalent stores only require AVX).
5912 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5913 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5914
5915 return false;
5916}
5917
5918bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5919 unsigned DataSize = DL.getTypeStoreSize(DataType);
5920
5921 // SSE4A supports nontemporal stores of float and double at arbitrary
5922 // alignment.
5923 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5924 return true;
5925
5926 // Besides the SSE4A subtarget exception above, only aligned stores are
5927  // available nontemporally on any other subtarget. And only stores with a size
5928 // of 4..32 bytes (powers of 2, only) are permitted.
5929 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5930 !isPowerOf2_32(DataSize))
5931 return false;
5932
5933 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5934 // loads require AVX2).
5935 if (DataSize == 32)
5936 return ST->hasAVX();
5937 if (DataSize == 16)
5938 return ST->hasSSE1();
5939 return true;
5940}
5941
5942bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5943                                      ElementCount NumElements) const {
5944 // movddup
5945 return ST->hasSSE3() && !NumElements.isScalable() &&
5946 NumElements.getFixedValue() == 2 &&
5947 ElementTy == Type::getDoubleTy(ElementTy->getContext());
5948}
5949
5950bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
5951  if (!isa<VectorType>(DataTy))
5952 return false;
5953
5954 if (!ST->hasAVX512())
5955 return false;
5956
5957 // The backend can't handle a single element vector.
5958 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5959 return false;
5960
5961 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5962
5963 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5964 return true;
5965
5966 if (!ScalarTy->isIntegerTy())
5967 return false;
5968
5969 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5970 return IntWidth == 32 || IntWidth == 64 ||
5971 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
5972}
5973
5974bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
5975  return isLegalMaskedExpandLoad(DataTy, Alignment);
5976}
5977
5978bool X86TTIImpl::supportsGather() const {
5979 // Some CPUs have better gather performance than others.
5980  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
5981 // enable gather with a -march.
5982 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
5983}
5984
5985bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
5986  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
5987 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
5988 // it to 8 elements, but zeroing upper bits of the mask vector will add more
5989 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
5990 // Check, maybe the gather/scatter instruction is better in the VariableMask
5991 // case.
5992 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
5993 return NumElts == 1 ||
5994 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
5995}
5996
5997bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
5998  Type *ScalarTy = DataTy->getScalarType();
5999 if (ScalarTy->isPointerTy())
6000 return true;
6001
6002 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6003 return true;
6004
6005 if (!ScalarTy->isIntegerTy())
6006 return false;
6007
6008 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6009 return IntWidth == 32 || IntWidth == 64;
6010}
6011
6012bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6013  if (!supportsGather() || !ST->preferGather())
6014 return false;
6015 return isLegalMaskedGatherScatter(DataTy, Alignment);
6016}
6017
6018bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6019 unsigned Opcode1,
6020 const SmallBitVector &OpcodeMask) const {
6021 // ADDSUBPS 4xf32 SSE3
6022 // VADDSUBPS 4xf32 AVX
6023 // VADDSUBPS 8xf32 AVX2
6024 // ADDSUBPD 2xf64 SSE3
6025 // VADDSUBPD 2xf64 AVX
6026 // VADDSUBPD 4xf64 AVX2
6027
6028 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6029 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6030 if (!isPowerOf2_32(NumElements))
6031 return false;
6032 // Check the opcode pattern. We apply the mask on the opcode arguments and
6033 // then check if it is what we expect.
6034 for (int Lane : seq<int>(0, NumElements)) {
6035 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6036 // We expect FSub for even lanes and FAdd for odd lanes.
6037 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6038 return false;
6039 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6040 return false;
6041 }
6042 // Now check that the pattern is supported by the target ISA.
6043 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6044 if (ElemTy->isFloatTy())
6045 return ST->hasSSE3() && NumElements % 4 == 0;
6046 if (ElemTy->isDoubleTy())
6047 return ST->hasSSE3() && NumElements % 2 == 0;
6048 return false;
6049}
6050
6051bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6052 // AVX2 doesn't support scatter
6053 if (!ST->hasAVX512() || !ST->preferScatter())
6054 return false;
6055 return isLegalMaskedGatherScatter(DataType, Alignment);
6056}
6057
6058bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6059 EVT VT = TLI->getValueType(DL, DataType);
6060 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6061}
6062
6063bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6064  // FDIV is always expensive, even if it has a very low uop count.
6065 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6066 if (I->getOpcode() == Instruction::FDiv)
6067 return true;
6068
6069  return BaseT::isExpensiveToSpeculativelyExecute(I);
6070}
6071
6073 return false;
6074}
6075
6076bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6077                                     const Function *Callee) const {
6078 const TargetMachine &TM = getTLI()->getTargetMachine();
6079
6080 // Work this as a subsetting of subtarget features.
6081 const FeatureBitset &CallerBits =
6082 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6083 const FeatureBitset &CalleeBits =
6084 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6085
6086 // Check whether features are the same (apart from the ignore list).
6087 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6088 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6089 if (RealCallerBits == RealCalleeBits)
6090 return true;
6091
6092 // If the features are a subset, we need to additionally check for calls
6093 // that may become ABI-incompatible as a result of inlining.
6094 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6095 return false;
6096
6097 for (const Instruction &I : instructions(Callee)) {
6098 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6099 // Having more target features is fine for inline ASM.
6100 if (CB->isInlineAsm())
6101 continue;
6102
6103      SmallVector<Type *, 8> Types;
6104      for (Value *Arg : CB->args())
6105 Types.push_back(Arg->getType());
6106 if (!CB->getType()->isVoidTy())
6107 Types.push_back(CB->getType());
6108
6109 // Simple types are always ABI compatible.
6110 auto IsSimpleTy = [](Type *Ty) {
6111 return !Ty->isVectorTy() && !Ty->isAggregateType();
6112 };
6113 if (all_of(Types, IsSimpleTy))
6114 continue;
6115
6116 if (Function *NestedCallee = CB->getCalledFunction()) {
6117 // Assume that intrinsics are always ABI compatible.
6118 if (NestedCallee->isIntrinsic())
6119 continue;
6120
6121 // Do a precise compatibility check.
6122 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6123 return false;
6124 } else {
6125 // We don't know the target features of the callee,
6126 // assume it is incompatible.
6127 return false;
6128 }
6129 }
6130 }
6131 return true;
6132}
6133
6134bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6135                                       const Function *Callee,
6136 const ArrayRef<Type *> &Types) const {
6137 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6138 return false;
6139
6140 // If we get here, we know the target features match. If one function
6141 // considers 512-bit vectors legal and the other does not, consider them
6142 // incompatible.
6143 const TargetMachine &TM = getTLI()->getTargetMachine();
6144
6145 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6146 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6147 return true;
6148
6149 // Consider the arguments compatible if they aren't vectors or aggregates.
6150 // FIXME: Look at the size of vectors.
6151 // FIXME: Look at the element types of aggregates to see if there are vectors.
6152 return llvm::none_of(Types,
6153 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6154}
6155
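// As an illustrative summary of the options built below: a typical 64-bit
// AVX2 target gets LoadSizes = {32, 16, 8, 4, 2, 1} for equality (zero)
// comparisons and {8, 4, 2, 1} for three-way memcmp expansion.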
6156X86TTIImpl::TTI::MemCmpExpansionOptions
6157X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6158  TTI::MemCmpExpansionOptions Options;
6159  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6160 Options.NumLoadsPerBlock = 2;
6161 // All GPR and vector loads can be unaligned.
6162 Options.AllowOverlappingLoads = true;
6163 if (IsZeroCmp) {
6164 // Only enable vector loads for equality comparison. Right now the vector
6165 // version is not as fast for three way compare (see #33329).
6166 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6167 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6168 Options.LoadSizes.push_back(64);
6169 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6170 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6171 }
6172 if (ST->is64Bit()) {
6173 Options.LoadSizes.push_back(8);
6174 }
6175 Options.LoadSizes.push_back(4);
6176 Options.LoadSizes.push_back(2);
6177 Options.LoadSizes.push_back(1);
6178 return Options;
6179}
6180
6181bool X86TTIImpl::prefersVectorizedAddressing() const {
6182  return supportsGather();
6183}
6184
6185bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6186  return false;
6187}
6188
6189bool X86TTIImpl::enableInterleavedAccessVectorization() {
6190  // TODO: We expect this to be beneficial regardless of arch,
6191 // but there are currently some unexplained performance artifacts on Atom.
6192 // As a temporary solution, disable on Atom.
6193 return !(ST->isAtom());
6194}
6195
6196// Get estimation for interleaved load/store operations and strided load.
6197// \p Indices contains indices for strided load.
6198// \p Factor - the factor of interleaving.
6199// AVX-512 provides 3-src shuffles that significantly reduce the cost.
6200InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6201    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6202 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6203 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6204 // VecTy for interleave memop is <VF*Factor x Elt>.
6205 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6206 // VecTy = <12 x i32>.
6207
6208 // Calculate the number of memory operations (NumOfMemOps), required
6209 // for load/store the VecTy.
6210 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6211 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6212 unsigned LegalVTSize = LegalVT.getStoreSize();
6213 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
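  // For instance, a 48-byte VecTy with a 32-byte legal type needs
  // NumOfMemOps = ceil(48 / 32) = 2 memory operations.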
6214
6215 // Get the cost of one memory operation.
6216 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6217 LegalVT.getVectorNumElements());
6218 InstructionCost MemOpCost;
6219 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6220 if (UseMaskedMemOp)
6221    MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6222                                      AddressSpace, CostKind);
6223 else
6224    MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6225                                AddressSpace, CostKind);
6226
6227 unsigned VF = VecTy->getNumElements() / Factor;
6228 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6229
6230 InstructionCost MaskCost;
6231 if (UseMaskedMemOp) {
6232 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6233 for (unsigned Index : Indices) {
6234 assert(Index < Factor && "Invalid index for interleaved memory op");
6235 for (unsigned Elm = 0; Elm < VF; Elm++)
6236 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6237 }
6238
6239 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6240
6241 MaskCost = getReplicationShuffleCost(
6242 I1Type, Factor, VF,
6243 UseMaskForGaps ? DemandedLoadStoreElts
6244                       : APInt::getAllOnes(VecTy->getNumElements()),
6245        CostKind);
6246
6247 // The Gaps mask is invariant and created outside the loop, therefore the
6248 // cost of creating it is not accounted for here. However if we have both
6249 // a MaskForGaps and some other mask that guards the execution of the
6250 // memory access, we need to account for the cost of And-ing the two masks
6251 // inside the loop.
6252 if (UseMaskForGaps) {
6253 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6254 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6255 }
6256 }
6257
6258 if (Opcode == Instruction::Load) {
6259 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6260 // contain the cost of the optimized shuffle sequence that the
6261 // X86InterleavedAccess pass will generate.
6262 // The cost of loads and stores are computed separately from the table.
6263
6264 // X86InterleavedAccess support only the following interleaved-access group.
6265 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6266 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6267 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6268 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6269 };
6270
6271 if (const auto *Entry =
6272 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6273 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6274    // If an entry does not exist, fall back to the default implementation.
6275
6276 // Kind of shuffle depends on number of loaded values.
6277 // If we load the entire data in one register, we can use a 1-src shuffle.
6278 // Otherwise, we'll merge 2 sources in each operation.
6279 TTI::ShuffleKind ShuffleKind =
6280 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6281
6282 InstructionCost ShuffleCost = getShuffleCost(
6283 ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6284
6285 unsigned NumOfLoadsInInterleaveGrp =
6286 Indices.size() ? Indices.size() : Factor;
6287 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6288 VecTy->getNumElements() / Factor);
6289 InstructionCost NumOfResults =
6290 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6291
6292    // About half of the loads may be folded into shuffles when we have only
6293 // one result. If we have more than one result, or the loads are masked,
6294 // we do not fold loads at all.
6295 unsigned NumOfUnfoldedLoads =
6296 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6297
6298 // Get a number of shuffle operations per result.
6299 unsigned NumOfShufflesPerResult =
6300 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6301
6302    // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6303 // When we have more than one destination, we need additional instructions
6304 // to keep sources.
6305 InstructionCost NumOfMoves = 0;
6306 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6307 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6308
6309 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6310 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6311 NumOfMoves;
6312
6313 return Cost;
6314 }
6315
6316 // Store.
6317 assert(Opcode == Instruction::Store &&
6318 "Expected Store Instruction at this point");
6319 // X86InterleavedAccess support only the following interleaved-access group.
6320 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6321 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6322 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6323 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6324
6325 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6326 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6327 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6328 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6329 };
6330
6331 if (const auto *Entry =
6332 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6333 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6334  // If an entry does not exist, fall back to the default implementation.
6335
6336  // There are no strided stores at the moment, and a store can't be folded
6337  // into a shuffle.
6338 unsigned NumOfSources = Factor; // The number of values to be merged.
6339 InstructionCost ShuffleCost = getShuffleCost(
6340 TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6341 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6342
6343  // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6344 // We need additional instructions to keep sources.
6345 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6346  InstructionCost Cost =
6347      MaskCost +
6348 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6349 NumOfMoves;
6350 return Cost;
6351}
6352
6353InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6354    unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6355 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6356 bool UseMaskForCond, bool UseMaskForGaps) {
6357 auto *VecTy = cast<FixedVectorType>(BaseTy);
6358
6359 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6360 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6361 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6362 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6363 return true;
6364 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6365 return ST->hasBWI();
6366 if (EltTy->isBFloatTy())
6367 return ST->hasBF16();
6368 return false;
6369 };
6370 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6371    return getInterleavedMemoryOpCostAVX512(
6372        Opcode, VecTy, Factor, Indices, Alignment,
6373 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6374
6375 if (UseMaskForCond || UseMaskForGaps)
6376 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6377 Alignment, AddressSpace, CostKind,
6378 UseMaskForCond, UseMaskForGaps);
6379
6380 // Get estimation for interleaved load/store operations for SSE-AVX2.
6381 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6382 // computing the cost using a generic formula as a function of generic
6383 // shuffles. We therefore use a lookup table instead, filled according to
6384 // the instruction sequences that codegen currently generates.
6385
6386 // VecTy for interleave memop is <VF*Factor x Elt>.
6387 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6388 // VecTy = <12 x i32>.
6389 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6390
6391 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6392 // the VF=2, while v2i128 is an unsupported MVT vector type
6393 // (see MachineValueType.h::getVectorVT()).
6394 if (!LegalVT.isVector())
6395 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6396 Alignment, AddressSpace, CostKind);
6397
6398 unsigned VF = VecTy->getNumElements() / Factor;
6399 Type *ScalarTy = VecTy->getElementType();
6400 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6401 if (!ScalarTy->isIntegerTy())
6402 ScalarTy =
6403 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6404
6405 // Get the cost of all the memory operations.
6406 // FIXME: discount dead loads.
6407 InstructionCost MemOpCosts = getMemoryOpCost(
6408 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6409
6410 auto *VT = FixedVectorType::get(ScalarTy, VF);
6411 EVT ETy = TLI->getValueType(DL, VT);
6412 if (!ETy.isSimple())
6413 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6414 Alignment, AddressSpace, CostKind);
6415
6416 // TODO: Complete for other data-types and strides.
6417 // Each combination of Stride, element bit width and VF results in a different
6418 // sequence; The cost tables are therefore accessed with:
6419 // Factor (stride) and VectorType=VFxiN.
6420 // The Cost accounts only for the shuffle sequence;
6421 // The cost of the loads/stores is accounted for separately.
6422 //
6423 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6424 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6425 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6426 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6427 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6428 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6429
6430 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6431 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6432 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6433
6434 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6435 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6436 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6437
6438 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6439 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6440 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6441 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6442
6443 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6444 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6445 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6446 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6447 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6448
6449 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6450 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6451 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6452 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6453 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6454
6455 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6456 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6457 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6458 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6459 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6460
6461 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6462 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6463 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6464 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6465
6466 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6467 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6468 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6469 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6470 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6471
6472 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6473 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6474 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6475 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6476 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6477
6478 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6479 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6480 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6481 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6482 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6483
6484 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6485 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6486 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6487 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6488
6489 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6490 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6491 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6492 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6493 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6494
6495 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6496 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6497 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6498 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6499 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6500
6501 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6502 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6503 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6504 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6505
6506 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6507 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6508 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6509
6510 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6511 };
6512
6513 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6514 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6515 };
6516
6517 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6518 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6519 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6520
6521 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6522 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6523
6524 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6525 };
6526
6527 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6528 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6529 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6530
6531 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6532 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6533 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6534
6535 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6536 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6537 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6538 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6539
6540 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6541 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6542 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6543 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6544 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6545
6546 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6547 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6548 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6549 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6550 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6551
6552 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6553 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6554 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6555 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6556 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6557
6558 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6559 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6560 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6561 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6562 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6563
6564 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6565 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6566 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6567 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6568
6569 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6570 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6571 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6572 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6573 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6574
6575 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6576 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6577 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6578 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6579 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6580
6581 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6582 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6583 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6584 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6585 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6586
6587 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6588 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6589 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6590 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6591
6592 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6593 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6594 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6595 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6596 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6597
6598 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6599 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6600 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6601 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6602 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6603
6604 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6605 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6606 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6607 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6608
6609 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6610 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6611 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6612 };
6613
6614 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6615 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6616 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6617 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6618
6619 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6620 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6621
6622 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6623 };
6624
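  // The tables above are keyed on the interleave factor and the vector type;
  // the lookups below try the most specific feature level first and fall back
  // to the generic base-class cost when no table entry matches.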
6625 if (Opcode == Instruction::Load) {
6626 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6627 MemOpCosts](const CostTblEntry *Entry) {
6628 // NOTE: this is just an approximation!
6629      //       It can over/under-estimate the cost!
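      //       E.g. (hypothetical query): Factor == 4 with two used indices and
      //       the AVX2 v16i8 entry above (cost 24) yields
      //       MemOpCosts + ceil(2 * 24 / 4) = MemOpCosts + 12.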
6630 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6631 };
6632
6633 if (ST->hasAVX2())
6634 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6635 ETy.getSimpleVT()))
6636 return GetDiscountedCost(Entry);
6637
6638 if (ST->hasSSSE3())
6639 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6640 ETy.getSimpleVT()))
6641 return GetDiscountedCost(Entry);
6642
6643 if (ST->hasSSE2())
6644 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6645 ETy.getSimpleVT()))
6646 return GetDiscountedCost(Entry);
6647 } else {
6648 assert(Opcode == Instruction::Store &&
6649 "Expected Store Instruction at this point");
6650 assert((!Indices.size() || Indices.size() == Factor) &&
6651 "Interleaved store only supports fully-interleaved groups.");
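    // Stores are always fully interleaved (see the assert above), so the full
    // table cost applies with no per-index discount, unlike the load path.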
6652 if (ST->hasAVX2())
6653 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6654 ETy.getSimpleVT()))
6655 return MemOpCosts + Entry->Cost;
6656
6657 if (ST->hasSSE2())
6658 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6659 ETy.getSimpleVT()))
6660 return MemOpCosts + Entry->Cost;
6661 }
6662
6663 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6664 Alignment, AddressSpace, CostKind,
6665 UseMaskForCond, UseMaskForGaps);
6666}
6667
6668InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6669                                                 int64_t BaseOffset,
6670 bool HasBaseReg, int64_t Scale,
6671 unsigned AddrSpace) const {
6672 // Scaling factors are not free at all.
6673 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6674 // will take 2 allocations in the out of order engine instead of 1
6675 // for plain addressing mode, i.e. inst (reg1).
6676 // E.g.,
6677 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6678 // Requires two allocations (one for the load, one for the computation)
6679 // whereas:
6680 // vaddps (%rsi), %ymm0, %ymm1
6681  // Requires just 1 allocation, i.e., freeing allocations for other operations
6682  // and having fewer micro-operations to execute.
6683 //
6684  // For some X86 architectures, this is even worse: for stores, for instance,
6685  // the complex addressing mode forces the instruction to use the
6686 // "load" ports instead of the dedicated "store" port.
6687 // E.g., on Haswell:
6688 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6689 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6690  TargetLoweringBase::AddrMode AM;
6691  AM.BaseGV = BaseGV;
6692 AM.BaseOffs = BaseOffset;
6693 AM.HasBaseReg = HasBaseReg;
6694 AM.Scale = Scale;
6695 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6696 // Scale represents reg2 * scale, thus account for 1
6697 // as soon as we use a second register.
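    // (E.g., plain (%rsi) has Scale == 0 and costs 0, while (%rsi,%rdx,4) has
    // Scale != 0 and costs 1; an unsupported mode falls through to -1 below.)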
6698 return AM.Scale != 0;
6699 return -1;
6700}