//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the Cost Model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than a specific CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below the cost is
/// based on Nehalem as that was the first CPU to support that feature level
/// and thus most likely has the worst case cost, although we may discard an
/// outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                   divss     sqrtss     rsqrtss
///   AMD K7          11-16     19         3
///   Piledriver      9-24      13-15      5
///   Jaguar          14        16         2
///   Pentium II,III  18        30         2
///   Nehalem         7-14      7-18       3
///   Haswell         10-13     11         5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
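///
/// For example, a table entry such as
///   { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }
/// encodes recip-throughput 1, latency 7, code-size 2 and size-and-latency 3
/// for a v16i8 shift-left, in the field order defined by the CostKindCosts
/// helper below.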
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;

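// Each cost table below pairs an ISD opcode and an MVT with a CostKindCosts
// tuple. Queries resolve with the recurring pattern:
//   if (const auto *Entry = CostTableLookup(Table, ISD, LT.second))
//     if (auto KindCost = Entry->Cost[CostKind])
//       return LT.first * *KindCost;
// i.e. the cost for the requested kind is scaled by the number of legalized
// vector operations (LT.first).
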
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TTI::CastContextHint::None, CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TTI::CastContextHint::None, CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
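  // (For example, x * 8 is costed as x << 3, and x * -8 as 0 - (x << 3).)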
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }

  // On X86, vector signed division by a constant power-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }

  // Vector unsigned division/remainder will be simplified to shifts/masks.
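  // (For example, x udiv 16 is costed as x >> 4, and x urem 16 as x & 15.)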
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
    // custom so that we can detect the cases where the shift amount is a
    // scalar one.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by non-uniform constant can be lowered
    // into vector multiply.
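    // (e.g. x << <1, 2, 3, 4> is costed like x * <2, 4, 8, 16>.)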
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsllvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsllvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps

    { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
    { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

1258 static const CostKindTblEntry SSE41CostTable[] = {
1259 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1260 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1261 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1262
1263 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1264 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1265 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1266 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1267
1268 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1269 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1270 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1271 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1272
1273 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1274 };
1275
1276 if (ST->hasSSE41())
1277 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1278 if (auto KindCost = Entry->Cost[CostKind])
1279 return LT.first * *KindCost;
1280
1281 static const CostKindTblEntry SSE2CostTable[] = {
1282 // We don't correctly identify costs of casts because they are marked as
1283 // custom.
1284 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1285 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1286 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1287 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1288
1289 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1290 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1291 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1292 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1293
1294 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1295 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1296 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1297 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1298
1299 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1300 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1301 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1302 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1303
1304 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1305 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1306 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1307 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1308
1309 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1310 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1311 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1312 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1313
1314 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1315 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1316
1317 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1318 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1319 { ISD::MUL, MVT::v2i64, { 8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1320
1321 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1322 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1323 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1324 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1325
1326 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1327 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1328 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1329 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1330
1331 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1332 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1333 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1334
1335 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1336 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1337 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1338
1339 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1340 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1341 };
1342
1343 if (ST->hasSSE2())
1344 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1345 if (auto KindCost = Entry->Cost[CostKind])
1346 return LT.first * *KindCost;
1347
1348 static const CostKindTblEntry SSE1CostTable[] = {
1349 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1350 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1351
1352 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1353 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1354
1355 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1356 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1357
1358 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1359 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1360
1361 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1362 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1363 };
1364
1365 if (ST->hasSSE1())
1366 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1367 if (auto KindCost = Entry->Cost[CostKind])
1368 return LT.first * *KindCost;
1369
1370 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1371 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1372 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1373 { ISD::MUL, MVT::i64, { 2 } }, // Nehalem from http://www.agner.org/
1374 };
1375
1376 if (ST->is64Bit())
1377 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1378 if (auto KindCost = Entry->Cost[CostKind])
1379 return LT.first * *KindCost;
1380
1381 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1382 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1383 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1384 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1385
1386 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1387 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1388 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1389
1390 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1391 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1392 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1393 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1394 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1395 };
1396
1397 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1398 if (auto KindCost = Entry->Cost[CostKind])
1399 return LT.first * *KindCost;
1400
1401 // It is not a good idea to vectorize division. We have to scalarize it and
1402   // in the process we will often end up having to spill regular
1403   // registers. The overhead of division is going to dominate most kernels
1404   // anyway so try hard to prevent vectorization of division - it is
1405 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1406 // to hide "20 cycles" for each lane.
1407 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1408 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1409 ISD == ISD::UREM)) {
1410     InstructionCost ScalarCost =
1411         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1412                                Op1Info.getNoProps(), Op2Info.getNoProps());
1413 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1414 }
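  // A rough worked example of the heuristic above (values are illustrative,
  // not taken from the cost tables): an sdiv on <8 x i32> with only SSE2
  // splits into 2 x v4i32, so LT.first == 2 and getVectorNumElements() == 4.
  // Assuming the scalar sdiv cost resolves to roughly 6, the returned cost is
  // 20 * 2 * 4 * 6 == 960 reciprocal throughput - large enough that the
  // vectorizers will keep the division scalar.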
1415
1416 // Handle some basic single instruction code size cases.
1417 if (CostKind == TTI::TCK_CodeSize) {
1418 switch (ISD) {
1419 case ISD::FADD:
1420 case ISD::FSUB:
1421 case ISD::FMUL:
1422 case ISD::FDIV:
1423 case ISD::FNEG:
1424 case ISD::AND:
1425 case ISD::OR:
1426 case ISD::XOR:
1427 return LT.first;
1428 break;
1429 }
1430 }
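  // For example: a single fadd <8 x float> legalizes to one v8f32 vaddps on
  // an AVX target (LT.first == 1, code-size cost 1), but splits into two
  // v4f32 addps on an SSE-only target (LT.first == 2, code-size cost 2).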
1431
1432   // Fall back to the default implementation.
1433 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1434 Args, CxtI);
1435}
1436 
1437 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1438                                            VectorType *BaseTp,
1439                                            ArrayRef<int> Mask,
1440                                            TTI::TargetCostKind CostKind,
1441                                            int Index, VectorType *SubTp,
1442                                            ArrayRef<const Value *> Args) {
1443 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1444 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1445 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1446
1447 Kind = improveShuffleKindFromMask(Kind, Mask);
1448
1449 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1450 if (Kind == TTI::SK_Transpose)
1451 Kind = TTI::SK_PermuteTwoSrc;
1452
1453 // For Broadcasts we are splatting the first element from the first input
1454   // register, so we only need to reference that input and all the output
1455 // registers are the same.
1456 if (Kind == TTI::SK_Broadcast)
1457 LT.first = 1;
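  // E.g. broadcasting a splat to <16 x float> on an SSE1 target legalizes to
  // 4 x v4f32 (LT.first would otherwise be 4), but every destination register
  // receives the same splat of element 0, so the table cost below is charged
  // once rather than once per split register.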
1458
1459 // Subvector extractions are free if they start at the beginning of a
1460 // vector and cheap if the subvectors are aligned.
1461 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1462 int NumElts = LT.second.getVectorNumElements();
1463 if ((Index % NumElts) == 0)
1464 return 0;
1465 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1466 if (SubLT.second.isVector()) {
1467 int NumSubElts = SubLT.second.getVectorNumElements();
1468 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1469 return SubLT.first;
1470 // Handle some cases for widening legalization. For now we only handle
1471 // cases where the original subvector was naturally aligned and evenly
1472 // fit in its legalized subvector type.
1473 // FIXME: Remove some of the alignment restrictions.
1474 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1475 // vectors.
1476 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1477 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1478 (NumSubElts % OrigSubElts) == 0 &&
1479 LT.second.getVectorElementType() ==
1480 SubLT.second.getVectorElementType() &&
1481           LT.second.getVectorElementType().getSizeInBits() ==
1482               BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1483         assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1484 "Unexpected number of elements!");
1485 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1486 LT.second.getVectorNumElements());
1487 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1488 SubLT.second.getVectorNumElements());
1489 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1490 InstructionCost ExtractCost =
1491 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1492 CostKind, ExtractIndex, SubTy);
1493
1494 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1495 // if we have SSSE3 we can use pshufb.
1496 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1497 return ExtractCost + 1; // pshufd or pshufb
1498
1499 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1500 "Unexpected vector size");
1501
1502 return ExtractCost + 2; // worst case pshufhw + pshufd
1503 }
1504 }
1505 }
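  // Illustrative examples of the rules above, on an AVX target: extracting
  // <4 x float> from <8 x float> at index 0 is free; at index 4 it is
  // subvector-aligned and costs SubLT.first; at index 2 it is unaligned and
  // falls through to the generic shuffle costing further below.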
1506
1507 // Subvector insertions are cheap if the subvectors are aligned.
1508 // Note that in general, the insertion starting at the beginning of a vector
1509 // isn't free, because we need to preserve the rest of the wide vector.
1510 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1511 int NumElts = LT.second.getVectorNumElements();
1512 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1513 if (SubLT.second.isVector()) {
1514 int NumSubElts = SubLT.second.getVectorNumElements();
1515 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1516 return SubLT.first;
1517 }
1518
1519 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1520 Kind = TTI::SK_PermuteTwoSrc;
1521 }
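  // E.g. on an AVX target, inserting <4 x float> into <8 x float> at index 4
  // is aligned and costs SubLT.first, while an insertion at index 2 must
  // blend with the surrounding lanes and is costed below as a two-source
  // permute.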
1522
1523 // Handle some common (illegal) sub-vector types as they are often very cheap
1524 // to shuffle even on targets without PSHUFB.
1525 EVT VT = TLI->getValueType(DL, BaseTp);
1526 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1527 !ST->hasSSSE3()) {
1528 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1529 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1530 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1531 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1532 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1533 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1534
1535 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1536 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1537 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1538 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1539
1540 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1541 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1542 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1543 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1544
1545 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1546 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1547 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1548 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1549 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1550
1551 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1552 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1553 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1554 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1555 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1556 };
1557
1558 if (ST->hasSSE2())
1559 if (const auto *Entry =
1560 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1561 return Entry->Cost;
1562 }
1563
1564 // We are going to permute multiple sources and the result will be in multiple
1565   // destinations. We provide an accurate cost only for splits where the
1566   // element type remains the same.
1567 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1568 MVT LegalVT = LT.second;
1569 if (LegalVT.isVector() &&
1570         LegalVT.getVectorElementType().getSizeInBits() ==
1571             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1572         LegalVT.getVectorNumElements() <
1573 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1574
1575 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1576 unsigned LegalVTSize = LegalVT.getStoreSize();
1577 // Number of source vectors after legalization:
1578 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1579 // Number of destination vectors after legalization:
1580 InstructionCost NumOfDests = LT.first;
1581
1582 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1583 LegalVT.getVectorNumElements());
1584
1585 if (!Mask.empty() && NumOfDests.isValid()) {
1586 // Try to perform better estimation of the permutation.
1587 // 1. Split the source/destination vectors into real registers.
1588       // 2. Do the mask analysis to identify which real registers are
1589       // permuted. If more than one source register is used to build a
1590       // destination register, the cost for this destination register is
1591       // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1592       // source register is used, build the mask and calculate the cost as
1593       // a cost of PermuteSingleSrc.
1594 // Also, for the single register permute we try to identify if the
1595 // destination register is just a copy of the source register or the
1596 // copy of the previous destination register (the cost is
1597 // TTI::TCC_Basic). If the source register is just reused, the cost for
1598 // this operation is 0.
1599 unsigned E = *NumOfDests.getValue();
1600 unsigned NormalizedVF =
1601 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1602 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1603 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1604 SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
1605 copy(Mask, NormalizedMask.begin());
1606 unsigned PrevSrcReg = 0;
1607       ArrayRef<int> PrevRegMask;
1608       InstructionCost Cost = 0;
1609       processShuffleMasks(
1610           NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1611 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1612 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1613 if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
1614 // Check if the previous register can be just copied to the next
1615 // one.
1616 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1617                     PrevRegMask != RegMask)
1618                   Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1619                                          RegMask, CostKind, 0, nullptr);
1620                 else
1621                   // Just a copy of previous destination register.
1622                   Cost += TTI::TCC_Basic;
1623                 return;
1624 }
1625 if (SrcReg != DestReg &&
1626 any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
1627                 // Just a copy of the source register.
1628                 Cost += TTI::TCC_Basic;
1629               }
1630 PrevSrcReg = SrcReg;
1631 PrevRegMask = RegMask;
1632 },
1633 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1634 unsigned /*Unused*/,
1635 unsigned /*Unused*/) {
1636 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1637 CostKind, 0, nullptr);
1638 });
1639 return Cost;
1640 }
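        // Sketch of the estimation above on a hypothetical mask: permuting
        // <16 x i32> with v4i32 registers yields 4 source and 4 destination
        // registers. A destination built from 2 source registers adds one
        // SK_PermuteTwoSrc, one built from a single shuffled source adds one
        // SK_PermuteSingleSrc, and one that merely copies a source register
        // (or repeats the previous destination) adds at most TTI::TCC_Basic.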
1641
1642 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1643 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1644 std::nullopt, CostKind, 0, nullptr);
1645 }
1646
1647 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1648 }
1649
1650 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1651 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1652 // We assume that source and destination have the same vector type.
1653 InstructionCost NumOfDests = LT.first;
1654 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1655 LT.first = NumOfDests * NumOfShufflesPerDest;
1656 }
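  // Worked example: a two-source shuffle of <16 x i32> on an SSE2 target has
  // LT.first == 4, giving NumOfDests == 4 and NumOfShufflesPerDest == 7, so
  // any per-register cost found in the tables below is scaled by 28.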
1657
1658 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1659 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1660 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1661
1662 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1663 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1664
1665 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1666 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1667 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1668 };
1669
1670 if (ST->hasVBMI())
1671 if (const auto *Entry =
1672 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1673 return LT.first * Entry->Cost;
1674
1675 static const CostTblEntry AVX512BWShuffleTbl[] = {
1676 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1677 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1678 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1679
1680 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1681 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1682 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1683 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1684
1685 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1686 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1687 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1688 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1689 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1690
1691 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1692 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1693 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1694 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1695 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1696
1697 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1698 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1699
1700 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1701 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1702 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1703 };
1704
1705 if (ST->hasBWI())
1706 if (const auto *Entry =
1707 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1708 return LT.first * Entry->Cost;
1709
1710 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1711 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1712 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1713 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1714 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1715 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1716 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1717 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1718
1719 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1720 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1721 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1722 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1723 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1724 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1725 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1726
1727 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1728 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1729 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1730 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1731 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1732 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1733 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1734 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1735 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1736 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1737 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1738
1739 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1740 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1741 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1742 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1743 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1744 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1745 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1746 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1747 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1748 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1749 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1750 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1751 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1752
1753 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1754 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1755 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1756 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1757 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1758 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1759 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1760 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1761 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1762 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1763 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1764 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1765
1766 // FIXME: This just applies the type legalization cost rules above
1767 // assuming these completely split.
1768 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1769 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1770 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1771 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1772 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1773 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1774
1775 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1776 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1777 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1778 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1779 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1780 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1781 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1782 };
1783
1784 if (ST->hasAVX512())
1785 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1786 if (auto KindCost = Entry->Cost[CostKind])
1787 return LT.first * *KindCost;
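  // Note that each { a, b, c, d } tuple above is indexed by cost kind in the
  // order { RecipThroughput, Latency, CodeSize, SizeAndLatency }, so e.g. an
  // SK_Reverse of v8f64 queried for TCK_Latency returns LT.first * 3 from the
  // vpermpd entry.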
1788
1789 static const CostTblEntry AVX2ShuffleTbl[] = {
1790 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1791 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1792 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1793 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1794 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1795 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1796 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1797
1798 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1799 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1800 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1801 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1802 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1803 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1804 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1805
1806 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1807 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1808 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1809
1810 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1811 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1812 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1813 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1814 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1815
1816 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1817 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1818 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1819 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1820 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1821 // + vpblendvb
1822 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1823 // + vpblendvb
1824 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1825 // + vpblendvb
1826
1827 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1828 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1829 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1830 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1831 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1832 // + vpblendvb
1833 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1834 // + vpblendvb
1835 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1836 // + vpblendvb
1837 };
1838
1839 if (ST->hasAVX2())
1840 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1841 return LT.first * Entry->Cost;
1842
1843 static const CostTblEntry XOPShuffleTbl[] = {
1844 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1845 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1846 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1847 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1848 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1849 // + vinsertf128
1850 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1851 // + vinsertf128
1852
1853 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1854 // + vinsertf128
1855 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1856 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1857 // + vinsertf128
1858 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1859 };
1860
1861 if (ST->hasXOP())
1862 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1863 return LT.first * Entry->Cost;
1864
1865 static const CostTblEntry AVX1ShuffleTbl[] = {
1866 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1867 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1868 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1869 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1870 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1871 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1872 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1873
1874 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1875 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1876 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1877 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1878 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1879 // + vinsertf128
1880 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1881 // + vinsertf128
1882 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1883 // + vinsertf128
1884
1885 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1886 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1887 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1888 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1889 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1890 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1891 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1892
1893 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1894 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1895 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1896 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1897 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1898 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1899 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1900
1901 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1902 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1903 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1904 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1905 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1906 // + 2*por + vinsertf128
1907 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1908 // + 2*por + vinsertf128
1909 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1910 // + 2*por + vinsertf128
1911
1912 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1913 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1914 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1915 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1916 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1917 // + 4*por + vinsertf128
1918 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1919 // + 4*por + vinsertf128
1920 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1921 // + 4*por + vinsertf128
1922 };
1923
1924 if (ST->hasAVX())
1925 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1926 return LT.first * Entry->Cost;
1927
1928 static const CostTblEntry SSE41ShuffleTbl[] = {
1929 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1930 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1931 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1932 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1933 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1934 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1935 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1936 };
1937
1938 if (ST->hasSSE41())
1939 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1940 return LT.first * Entry->Cost;
1941
1942 static const CostTblEntry SSSE3ShuffleTbl[] = {
1943 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1944 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1945 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1946
1947 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1948 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
1949 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1950
1951 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1952 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
1953 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1954
1955 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
1956 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
1957 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
1958 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
1959 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
1960
1961 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1962 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
1963 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1964
1965 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1966 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
1967 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1968 };
1969
1970 if (ST->hasSSSE3())
1971 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1972 return LT.first * Entry->Cost;
1973
1974 static const CostTblEntry SSE2ShuffleTbl[] = {
1975 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1976 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1977 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1978 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1979 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
1980 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1981
1982 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1983 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1984 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1985 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1986 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
1987 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1988 // + 2*pshufd + 2*unpck + packus
1989
1990 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1991 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1992 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1993 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1994 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
1995 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1996
1997 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
1998 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
1999 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2000 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por
2001 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por
2002 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por
2003
2004 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2005 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2006 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2007 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2008 // + pshufd/unpck
2009 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2010 // + pshufd/unpck
2011 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2012 // + 2*pshufd + 2*unpck + 2*packus
2013
2014 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2015 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2016 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2017 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2018 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2019 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2020 };
2021
2022 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2023 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2024 };
2025
2026 if (ST->hasSSE2()) {
2027 bool IsLoad =
2028 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2029 if (ST->hasSSE3() && IsLoad)
2030 if (const auto *Entry =
2031 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2032         assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2033                                     LT.second.getVectorElementCount()) &&
2034 "Table entry missing from isLegalBroadcastLoad()");
2035 return LT.first * Entry->Cost;
2036 }
2037
2038 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2039 return LT.first * Entry->Cost;
2040 }
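  // E.g. a broadcast of <2 x double> fed by a load is free on SSE3 because
  // movddup performs the splat as part of the load itself; without SSE3 the
  // same broadcast is costed from the SSE2 table as a single shufpd.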
2041
2042 static const CostTblEntry SSE1ShuffleTbl[] = {
2043 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2044 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2045 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2046 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2047 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2048 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2049 };
2050
2051 if (ST->hasSSE1())
2052 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2053 return LT.first * Entry->Cost;
2054
2055 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2056}
2057
2058 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2059                                              Type *Src,
2060                                              TTI::CastContextHint CCH,
2061                                              TTI::TargetCostKind CostKind,
2062                                              const Instruction *I) {
2063 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2064 assert(ISD && "Invalid opcode");
2065
2066 // TODO: Allow non-throughput costs that aren't binary.
2067 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2068     if (CostKind != TTI::TCK_RecipThroughput)
2069       return Cost == 0 ? 0 : 1;
2070 return Cost;
2071 };
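  // In other words, for any cost kind other than TCK_RecipThroughput the
  // table values below are currently collapsed to a binary answer: e.g. a
  // table cost of 3 queried with TCK_CodeSize is reported as 1.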
2072
2073 // The cost tables include both specific, custom (non-legal) src/dst type
2074   // conversions and generic, legalized types. We test for custom conversions
2075   // first, before falling back to legalization.
2076 // FIXME: Need a better design of the cost table to handle non-simple types of
2077 // potential massive combinations (elem_num x src_type x dst_type).
2078 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2081
2082 // Mask sign extend has an instruction.
2100
2101 // Mask zero extend is a sext + shift.
2119
2137
2139 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
2140 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
2141 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
2142 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
2143 };
2144
2145 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2146 // Mask sign extend has an instruction.
2155
2156 // Mask zero extend is a sext + shift.
2165
2174
2177
2180
2183
2186 };
2187
2188 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2189 // 256-bit wide vectors.
2190
2191 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2195
2196 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2197 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2198 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2199 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
2200 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2201 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2202 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2203 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2204 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
2205 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
2206 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
2207 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
2208 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
2209 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
2210 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
2211 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
2212 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
2213 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
2214 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
2215 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
2216 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
2217 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
2218 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
2219 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
2220 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
2221 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
2222 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
2223 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
2224 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
2225 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
2226 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
2227 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
2228 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
2229     { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i64, 5 }, // 2*vpmovqd+concat+vpmovdb
2230
2231 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
2234
2235 // Sign extend is zmm vpternlogd+vptruncdb.
2236 // Zero extend is zmm broadcast load+vptruncdw.
2245
2246 // Sign extend is zmm vpternlogd+vptruncdw.
2247 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2256
2257 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
2258 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
2259 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
2260 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
2261 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
2262 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
2263 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
2264 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
2265 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
2266 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
2267
2268 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
2269 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2270 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
2271 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
2272
2283
2284 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2285 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2286
2295
2306
2318
2325 };
2326
2327 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2328 // Mask sign extend has an instruction.
2346
2347 // Mask zero extend is a sext + shift.
2365
2383
2385 };
2386
2387 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
2388 // Mask sign extend has an instruction.
2397
2398 // Mask zero extend is a sext + shift.
2407
2416
2421
2426
2431
2436 };
2437
2438 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2439 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2440 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2441 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2442 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2443 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2444 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2445 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2446 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2447 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2448 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2449 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2450 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2451 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2452 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2453 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2454 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2455 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2456 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2457
2458 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2459 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2468
2469 // sign extend is vpcmpeq+maskedmove+vpmovdw
2470 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2479
2480 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2481 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2482 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2483 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2484 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2485 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2486 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
2487 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2488
2489 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2490 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2491 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2492 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2493
2506
2511
2525
2529
2537 };
2538
2539 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2546
2561
2563
2576
2579
2584
2593
2601
2612 };
2613
2614 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2621
2634
2640
2643 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2647 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2649
2662
2680
2692
2706
2709 };
2710
2711 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2724
2725 // These truncates end up widening elements.
2726 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2727 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2728 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2729
2733
2745
2760
2771
2782 };
2783
2784 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2785 // These are somewhat magic numbers justified by comparing the
2786 // output of llvm-mca for our various supported scheduler models
2787     // and basing it on the worst case scenario.
2800
2814
2825
2836
2849
2850 // These truncates are really widening elements.
2851 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2852 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2853 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2854 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2855 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2856 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2857
2858 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2860 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2866 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2867 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2868 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2869 };
2870
2871 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2872 EVT SrcTy = TLI->getValueType(DL, Src);
2873 EVT DstTy = TLI->getValueType(DL, Dst);
2874
2875 // The function getSimpleVT only handles simple value types.
2876 if (SrcTy.isSimple() && DstTy.isSimple()) {
2877 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2878 MVT SimpleDstTy = DstTy.getSimpleVT();
2879
2880 if (ST->useAVX512Regs()) {
2881 if (ST->hasBWI())
2882 if (const auto *Entry = ConvertCostTableLookup(
2883 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2884 return AdjustCost(Entry->Cost);
2885
2886 if (ST->hasDQI())
2887 if (const auto *Entry = ConvertCostTableLookup(
2888 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2889 return AdjustCost(Entry->Cost);
2890
2891 if (ST->hasAVX512())
2892 if (const auto *Entry = ConvertCostTableLookup(
2893 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2894 return AdjustCost(Entry->Cost);
2895 }
2896
2897 if (ST->hasBWI())
2898 if (const auto *Entry = ConvertCostTableLookup(
2899 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2900 return AdjustCost(Entry->Cost);
2901
2902 if (ST->hasDQI())
2903 if (const auto *Entry = ConvertCostTableLookup(
2904 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2905 return AdjustCost(Entry->Cost);
2906
2907 if (ST->hasAVX512())
2908 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2909 SimpleDstTy, SimpleSrcTy))
2910 return AdjustCost(Entry->Cost);
2911
2912 if (ST->hasAVX2()) {
2913 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2914 SimpleDstTy, SimpleSrcTy))
2915 return AdjustCost(Entry->Cost);
2916 }
2917
2918 if (ST->hasAVX()) {
2919 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2920 SimpleDstTy, SimpleSrcTy))
2921 return AdjustCost(Entry->Cost);
2922 }
2923
2924 if (ST->hasSSE41()) {
2925 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2926 SimpleDstTy, SimpleSrcTy))
2927 return AdjustCost(Entry->Cost);
2928 }
2929
2930 if (ST->hasSSE2()) {
2931 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2932 SimpleDstTy, SimpleSrcTy))
2933 return AdjustCost(Entry->Cost);
2934 }
2935 }
2936
2937 // Fall back to legalized types.
2938 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2939 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2940
2941   // If we're truncating to the same legalized type, just assume it's free.
2942 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2943 return TTI::TCC_Free;
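  // E.g. a trunc of i32 to the illegal type i17 is free here, since i17 is
  // promoted back to i32 and both sides legalize to the same MVT.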
2944
2945 if (ST->useAVX512Regs()) {
2946 if (ST->hasBWI())
2947 if (const auto *Entry = ConvertCostTableLookup(
2948 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2949 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2950
2951 if (ST->hasDQI())
2952 if (const auto *Entry = ConvertCostTableLookup(
2953 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2954 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2955
2956 if (ST->hasAVX512())
2957 if (const auto *Entry = ConvertCostTableLookup(
2958 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2959 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2960 }
2961
2962 if (ST->hasBWI())
2963 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2964 LTDest.second, LTSrc.second))
2965 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2966
2967 if (ST->hasDQI())
2968 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2969 LTDest.second, LTSrc.second))
2970 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2971
2972 if (ST->hasAVX512())
2973 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2974 LTDest.second, LTSrc.second))
2975 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2976
2977 if (ST->hasAVX2())
2978 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2979 LTDest.second, LTSrc.second))
2980 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2981
2982 if (ST->hasAVX())
2983 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2984 LTDest.second, LTSrc.second))
2985 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2986
2987 if (ST->hasSSE41())
2988 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2989 LTDest.second, LTSrc.second))
2990 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2991
2992 if (ST->hasSSE2())
2993 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2994 LTDest.second, LTSrc.second))
2995 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2996
2997   // Fallback: for the i8/i16 sitofp/uitofp cases we need to extend to i32
2998   // for sitofp.
2999 if ((ISD == ISD::SINT_TO_FP