LLVM 17.0.0git
RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include <cmath>
#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
24 "riscv-v-register-bit-width-lmul",
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
29
31 "riscv-v-slp-max-vf",
33 "Result used for getMaximumVF query which is used exclusively by "
34 "SLP vectorizer. Defaults to 1 which disables SLP."),
36
37InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
38 // TODO: Here assume reciprocal throughput is 1 for LMUL_1, it is
39 // implementation-defined.
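  // For example (illustrative): an LMUL=4 scalable type costs 4, and a 256-bit
  // fixed-length vector on a subtarget whose minimum VLEN is 128 costs
  // 256 / 128 = 2.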
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  unsigned Cost;
43 if (VT.isScalableVector()) {
44 unsigned LMul;
45 bool Fractional;
    std::tie(LMul, Fractional) =
        RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT));
    if (Fractional)
49 Cost = 1;
50 else
51 Cost = LMul;
52 } else {
53 Cost = VT.getSizeInBits() / ST->getRealMinVLen();
54 }
55 return std::max<unsigned>(Cost, 1);
56}
57
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
61 "getIntImmCost can only estimate cost of materialising integers");
62
63 // We have a Zero register, so 0 is always free.
64 if (Imm == 0)
65 return TTI::TCC_Free;
66
67 // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
                                    getST()->getFeatureBits());
71}
72
73// Look for patterns of shift followed by AND that can be turned into a pair of
74// shifts. We won't need to materialize an immediate for the AND so these can
75// be considered free.
76static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
77 uint64_t Mask = Imm.getZExtValue();
78 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
79 if (!BO || !BO->hasOneUse())
80 return false;
81
82 if (BO->getOpcode() != Instruction::Shl)
83 return false;
84
85 if (!isa<ConstantInt>(BO->getOperand(1)))
86 return false;
87
88 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
89 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
90 // is a mask shifted by c2 bits with c3 leading zeros.
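  // e.g. (illustrative, RV64): Mask = 0x3f8 is 0x7f shifted left by c2 = 3 with
  // c3 = 54 leading zeros, so when ShAmt == 3 the AND can be folded into
  // (srli (slli x, 57), 54) and no immediate has to be materialized.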
91 if (isShiftedMask_64(Mask)) {
92 unsigned Trailing = llvm::countr_zero(Mask);
93 if (ShAmt == Trailing)
94 return true;
95 }
96
97 return false;
98}
99
InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
104 assert(Ty->isIntegerTy() &&
105 "getIntImmCost can only estimate cost of materialising integers");
106
107 // We have a Zero register, so 0 is always free.
108 if (Imm == 0)
109 return TTI::TCC_Free;
110
111 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
112 // commutative, in others the immediate comes from a specific argument index.
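  // e.g. (illustrative): addi/andi/ori/xori take a sign-extended 12-bit
  // immediate, i.e. values in [-2048, 2047]; anything wider has to be
  // materialized into a register first.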
113 bool Takes12BitImm = false;
114 unsigned ImmArgIdx = ~0U;
115
116 switch (Opcode) {
117 case Instruction::GetElementPtr:
118 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
119 // split up large offsets in GEP into better parts than ConstantHoisting
120 // can.
121 return TTI::TCC_Free;
122 case Instruction::And:
123 // zext.h
124 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
125 return TTI::TCC_Free;
126 // zext.w
127 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
128 return TTI::TCC_Free;
129 // bclri
130 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
131 return TTI::TCC_Free;
132 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
133 canUseShiftPair(Inst, Imm))
134 return TTI::TCC_Free;
135 Takes12BitImm = true;
136 break;
137 case Instruction::Add:
138 Takes12BitImm = true;
139 break;
140 case Instruction::Or:
141 case Instruction::Xor:
142 // bseti/binvi
143 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
144 return TTI::TCC_Free;
145 Takes12BitImm = true;
146 break;
147 case Instruction::Mul:
148 // Negated power of 2 is a shift and a negate.
149 if (Imm.isNegatedPowerOf2())
150 return TTI::TCC_Free;
151 // FIXME: There is no MULI instruction.
152 Takes12BitImm = true;
153 break;
154 case Instruction::Sub:
155 case Instruction::Shl:
156 case Instruction::LShr:
157 case Instruction::AShr:
158 Takes12BitImm = true;
159 ImmArgIdx = 1;
160 break;
161 default:
162 break;
163 }
164
165 if (Takes12BitImm) {
166 // Check immediate is the correct argument...
167 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
168 // ... and fits into the 12-bit immediate.
169 if (Imm.getSignificantBits() <= 64 &&
170 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
171 return TTI::TCC_Free;
172 }
173 }
174
175 // Otherwise, use the full materialisation cost.
176 return getIntImmCost(Imm, Ty, CostKind);
177 }
178
179 // By default, prevent hoisting.
180 return TTI::TCC_Free;
181}
182
InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
188 return TTI::TCC_Free;
189}
190
TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
194 return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
195}
196
bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
199 // reductions, but we still request expansion as RVV doesn't support certain
200 // reductions and the SelectionDAG can't legalize them either.
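  // For example (illustrative): RVV has reduction instructions for add, and,
  // or, xor, min and max, but nothing equivalent for a multiply reduction, so
  // vector_reduce_mul below has to be expanded before instruction selection.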
201 switch (II->getIntrinsicID()) {
202 default:
203 return false;
204 // These reductions have no equivalent in RVV
205 case Intrinsic::vector_reduce_mul:
206 case Intrinsic::vector_reduce_fmul:
207 return true;
208 }
209}
210
211std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
215}
216
217std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
218 if (ST->hasVInstructions())
219 if (unsigned MinVLen = ST->getRealMinVLen();
220 MinVLen >= RISCV::RVVBitsPerBlock)
221 return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}
224
TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }
242
243 llvm_unreachable("Unsupported register kind");
244}
245
InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
251 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
252
253 // First, handle cases where having a fixed length vector enables us to
254 // give a more accurate cost than falling back to generic scalable codegen.
255 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
256 if (isa<FixedVectorType>(Tp)) {
257 switch (Kind) {
258 default:
259 break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
262 MVT EltTp = LT.second.getVectorElementType();
263 // If the size of the element is < ELEN then shuffles of interleaves and
264 // deinterleaves of 2 vectors can be lowered into the following
265 // sequences
266 if (EltTp.getScalarSizeInBits() < ST->getELEN()) {
267 // Example sequence:
268 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
269 // vwaddu.vv v10, v8, v9
270 // li a0, -1 (ignored)
271 // vwmaccu.vx v10, a0, v9
272 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
273 return 2 * LT.first * getLMULCost(LT.second);
274
275 if (Mask[0] == 0 || Mask[0] == 1) {
276 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
277 // Example sequence:
278 // vnsrl.wi v10, v8, 0
279 if (equal(DeinterleaveMask, Mask))
280 return LT.first * getLMULCost(LT.second);
281 }
282 }
283 }
284 }
285 }
286 };
287
288 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
289 switch (Kind) {
290 default:
291 // Fallthrough to generic handling.
292 // TODO: Most of these cases will return getInvalid in generic code, and
293 // must be implemented here.
294 break;
295 case TTI::SK_Broadcast: {
296 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
297 Instruction::InsertElement);
298 if (LT.second.getScalarSizeInBits() == 1) {
299 if (HasScalar) {
300 // Example sequence:
301 // andi a0, a0, 1
302 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
303 // vmv.v.x v8, a0
304 // vmsne.vi v0, v8, 0
305 return LT.first * getLMULCost(LT.second) * 3;
306 }
307 // Example sequence:
308 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
309 // vmv.v.i v8, 0
310 // vmerge.vim v8, v8, 1, v0
311 // vmv.x.s a0, v8
312 // andi a0, a0, 1
313 // vmv.v.x v8, a0
314 // vmsne.vi v0, v8, 0
315
316 return LT.first * getLMULCost(LT.second) * 6;
317 }
318
319 if (HasScalar) {
320 // Example sequence:
321 // vmv.v.x v8, a0
322 return LT.first * getLMULCost(LT.second);
323 }
324
325 // Example sequence:
326 // vrgather.vi v9, v8, 0
327 // TODO: vrgather could be slower than vmv.v.x. It is
328 // implementation-dependent.
329 return LT.first * getLMULCost(LT.second);
330 }
331 case TTI::SK_Splice:
332 // vslidedown+vslideup.
333 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
334 // of similar code, but I think we expand through memory.
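    // Example sequence (illustrative register choices):
    //   vslidedown.vx v8, v8, a0
    //   vslideup.vx   v8, v9, a1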
335 return 2 * LT.first * getLMULCost(LT.second);
336 case TTI::SK_Reverse: {
337 // TODO: Cases to improve here:
338 // * LMUL > 1
339 // * i64 on RV32
340 // * i1 vector
341
342 // Most of the cost here is producing the vrgather index register
343 // Example sequence:
344 // csrr a0, vlenb
345 // srli a0, a0, 3
346 // addi a0, a0, -1
347 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
348 // vid.v v9
349 // vrsub.vx v10, v9, a0
350 // vrgather.vv v9, v8, v10
351 unsigned LenCost = 3;
352 if (LT.second.isFixedLengthVector())
353 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
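      // e.g. (illustrative): a 16-element fixed vector uses vrsub.vi with
      // immediate 15, which fits the simm5 range [-16, 15], so LenCost is 0.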
354 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
355 if (Tp->getElementType()->isIntegerTy(1))
    // A mask operation additionally requires an extend and a truncate.
357 return LT.first * (LenCost + 6);
358 return LT.first * (LenCost + 3);
359 }
360 }
361 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
362}
363
InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
371 CostKind);
372
373 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
374}
375
InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
378 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
379 bool UseMaskForCond, bool UseMaskForGaps) {
380 auto *FVTy = cast<FixedVectorType>(VecTy);
381 InstructionCost MemCost =
382 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
383 unsigned VF = FVTy->getNumElements() / Factor;
384
385 // An interleaved load will look like this for Factor=3:
386 // %wide.vec = load <12 x i32>, ptr %3, align 4
387 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
388 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
389 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
390 if (Opcode == Instruction::Load) {
391 InstructionCost Cost = MemCost;
392 for (unsigned Index : Indices) {
393 FixedVectorType *SubVecTy =
394 FixedVectorType::get(FVTy->getElementType(), VF);
395 auto Mask = createStrideMask(Index, Factor, VF);
396 InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
399 Cost += ShuffleCost;
400 }
401 return Cost;
402 }
403
404 // TODO: Model for NF > 2
405 // We'll need to enhance getShuffleCost to model shuffles that are just
406 // inserts and extracts into subvectors, since they won't have the full cost
407 // of a vrgather.
408 // An interleaved store for 3 vectors of 4 lanes will look like
409 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
410 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
411 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
412 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
413 // store <12 x i32> %interleaved.vec, ptr %10, align 4
414 if (Factor != 2)
415 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
416 Alignment, AddressSpace, CostKind,
417 UseMaskForCond, UseMaskForGaps);
418
419 assert(Opcode == Instruction::Store && "Opcode must be a store");
420 // For an interleaving store of 2 vectors, we perform one large interleaving
421 // shuffle that goes into the wide store
422 auto Mask = createInterleaveMask(VF, Factor);
423 InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
426 return MemCost + ShuffleCost;
427}
428
InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
434 Alignment, CostKind, I);
435
436 if ((Opcode == Instruction::Load &&
437 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
438 (Opcode == Instruction::Store &&
439 !isLegalMaskedScatter(DataTy, Align(Alignment))))
440 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
441 Alignment, CostKind, I);
442
443 // Cost is proportional to the number of memory operations implied. For
444 // scalable vectors, we use an estimate on that number since we don't
445 // know exactly what VL will be.
446 auto &VTy = *cast<VectorType>(DataTy);
447 InstructionCost MemOpCost =
448 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
449 {TTI::OK_AnyValue, TTI::OP_None}, I);
450 unsigned NumLoads = getEstimatedVLFor(&VTy);
451 return NumLoads * MemOpCost;
452}
453
454// Currently, these represent both throughput and codesize costs
455// for the respective intrinsics. The costs in this table are simply
456// instruction counts with the following adjustments made:
457// * One vsetvli is considered free.
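// For example (illustrative): the first entry below says that a single,
// already-legalized llvm.floor on MVT::v2f32 is modeled as 9 instructions.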
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::v2f32, 9},
460 {Intrinsic::floor, MVT::v4f32, 9},
461 {Intrinsic::floor, MVT::v8f32, 9},
462 {Intrinsic::floor, MVT::v16f32, 9},
463 {Intrinsic::floor, MVT::nxv1f32, 9},
464 {Intrinsic::floor, MVT::nxv2f32, 9},
465 {Intrinsic::floor, MVT::nxv4f32, 9},
466 {Intrinsic::floor, MVT::nxv8f32, 9},
467 {Intrinsic::floor, MVT::nxv16f32, 9},
468 {Intrinsic::floor, MVT::v2f64, 9},
469 {Intrinsic::floor, MVT::v4f64, 9},
470 {Intrinsic::floor, MVT::v8f64, 9},
471 {Intrinsic::floor, MVT::v16f64, 9},
472 {Intrinsic::floor, MVT::nxv1f64, 9},
473 {Intrinsic::floor, MVT::nxv2f64, 9},
474 {Intrinsic::floor, MVT::nxv4f64, 9},
475 {Intrinsic::floor, MVT::nxv8f64, 9},
476 {Intrinsic::ceil, MVT::v2f32, 9},
477 {Intrinsic::ceil, MVT::v4f32, 9},
478 {Intrinsic::ceil, MVT::v8f32, 9},
479 {Intrinsic::ceil, MVT::v16f32, 9},
480 {Intrinsic::ceil, MVT::nxv1f32, 9},
481 {Intrinsic::ceil, MVT::nxv2f32, 9},
482 {Intrinsic::ceil, MVT::nxv4f32, 9},
483 {Intrinsic::ceil, MVT::nxv8f32, 9},
484 {Intrinsic::ceil, MVT::nxv16f32, 9},
485 {Intrinsic::ceil, MVT::v2f64, 9},
486 {Intrinsic::ceil, MVT::v4f64, 9},
487 {Intrinsic::ceil, MVT::v8f64, 9},
488 {Intrinsic::ceil, MVT::v16f64, 9},
489 {Intrinsic::ceil, MVT::nxv1f64, 9},
490 {Intrinsic::ceil, MVT::nxv2f64, 9},
491 {Intrinsic::ceil, MVT::nxv4f64, 9},
492 {Intrinsic::ceil, MVT::nxv8f64, 9},
493 {Intrinsic::trunc, MVT::v2f32, 7},
494 {Intrinsic::trunc, MVT::v4f32, 7},
495 {Intrinsic::trunc, MVT::v8f32, 7},
496 {Intrinsic::trunc, MVT::v16f32, 7},
497 {Intrinsic::trunc, MVT::nxv1f32, 7},
498 {Intrinsic::trunc, MVT::nxv2f32, 7},
499 {Intrinsic::trunc, MVT::nxv4f32, 7},
500 {Intrinsic::trunc, MVT::nxv8f32, 7},
501 {Intrinsic::trunc, MVT::nxv16f32, 7},
502 {Intrinsic::trunc, MVT::v2f64, 7},
503 {Intrinsic::trunc, MVT::v4f64, 7},
504 {Intrinsic::trunc, MVT::v8f64, 7},
505 {Intrinsic::trunc, MVT::v16f64, 7},
506 {Intrinsic::trunc, MVT::nxv1f64, 7},
507 {Intrinsic::trunc, MVT::nxv2f64, 7},
508 {Intrinsic::trunc, MVT::nxv4f64, 7},
509 {Intrinsic::trunc, MVT::nxv8f64, 7},
510 {Intrinsic::round, MVT::v2f32, 9},
511 {Intrinsic::round, MVT::v4f32, 9},
512 {Intrinsic::round, MVT::v8f32, 9},
513 {Intrinsic::round, MVT::v16f32, 9},
514 {Intrinsic::round, MVT::nxv1f32, 9},
515 {Intrinsic::round, MVT::nxv2f32, 9},
516 {Intrinsic::round, MVT::nxv4f32, 9},
517 {Intrinsic::round, MVT::nxv8f32, 9},
518 {Intrinsic::round, MVT::nxv16f32, 9},
519 {Intrinsic::round, MVT::v2f64, 9},
520 {Intrinsic::round, MVT::v4f64, 9},
521 {Intrinsic::round, MVT::v8f64, 9},
522 {Intrinsic::round, MVT::v16f64, 9},
523 {Intrinsic::round, MVT::nxv1f64, 9},
524 {Intrinsic::round, MVT::nxv2f64, 9},
525 {Intrinsic::round, MVT::nxv4f64, 9},
526 {Intrinsic::round, MVT::nxv8f64, 9},
527 {Intrinsic::roundeven, MVT::v2f32, 9},
528 {Intrinsic::roundeven, MVT::v4f32, 9},
529 {Intrinsic::roundeven, MVT::v8f32, 9},
530 {Intrinsic::roundeven, MVT::v16f32, 9},
531 {Intrinsic::roundeven, MVT::nxv1f32, 9},
532 {Intrinsic::roundeven, MVT::nxv2f32, 9},
533 {Intrinsic::roundeven, MVT::nxv4f32, 9},
534 {Intrinsic::roundeven, MVT::nxv8f32, 9},
535 {Intrinsic::roundeven, MVT::nxv16f32, 9},
536 {Intrinsic::roundeven, MVT::v2f64, 9},
537 {Intrinsic::roundeven, MVT::v4f64, 9},
538 {Intrinsic::roundeven, MVT::v8f64, 9},
539 {Intrinsic::roundeven, MVT::v16f64, 9},
540 {Intrinsic::roundeven, MVT::nxv1f64, 9},
541 {Intrinsic::roundeven, MVT::nxv2f64, 9},
542 {Intrinsic::roundeven, MVT::nxv4f64, 9},
543 {Intrinsic::roundeven, MVT::nxv8f64, 9},
544 {Intrinsic::bswap, MVT::v2i16, 3},
545 {Intrinsic::bswap, MVT::v4i16, 3},
546 {Intrinsic::bswap, MVT::v8i16, 3},
547 {Intrinsic::bswap, MVT::v16i16, 3},
548 {Intrinsic::bswap, MVT::nxv1i16, 3},
549 {Intrinsic::bswap, MVT::nxv2i16, 3},
550 {Intrinsic::bswap, MVT::nxv4i16, 3},
551 {Intrinsic::bswap, MVT::nxv8i16, 3},
552 {Intrinsic::bswap, MVT::nxv16i16, 3},
553 {Intrinsic::bswap, MVT::v2i32, 12},
554 {Intrinsic::bswap, MVT::v4i32, 12},
555 {Intrinsic::bswap, MVT::v8i32, 12},
556 {Intrinsic::bswap, MVT::v16i32, 12},
557 {Intrinsic::bswap, MVT::nxv1i32, 12},
558 {Intrinsic::bswap, MVT::nxv2i32, 12},
559 {Intrinsic::bswap, MVT::nxv4i32, 12},
560 {Intrinsic::bswap, MVT::nxv8i32, 12},
561 {Intrinsic::bswap, MVT::nxv16i32, 12},
562 {Intrinsic::bswap, MVT::v2i64, 31},
563 {Intrinsic::bswap, MVT::v4i64, 31},
564 {Intrinsic::bswap, MVT::v8i64, 31},
565 {Intrinsic::bswap, MVT::v16i64, 31},
566 {Intrinsic::bswap, MVT::nxv1i64, 31},
567 {Intrinsic::bswap, MVT::nxv2i64, 31},
568 {Intrinsic::bswap, MVT::nxv4i64, 31},
569 {Intrinsic::bswap, MVT::nxv8i64, 31},
570 {Intrinsic::vp_bswap, MVT::v2i16, 3},
571 {Intrinsic::vp_bswap, MVT::v4i16, 3},
572 {Intrinsic::vp_bswap, MVT::v8i16, 3},
573 {Intrinsic::vp_bswap, MVT::v16i16, 3},
574 {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
575 {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
576 {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
577 {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
578 {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
579 {Intrinsic::vp_bswap, MVT::v2i32, 12},
580 {Intrinsic::vp_bswap, MVT::v4i32, 12},
581 {Intrinsic::vp_bswap, MVT::v8i32, 12},
582 {Intrinsic::vp_bswap, MVT::v16i32, 12},
583 {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
584 {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
585 {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
586 {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
587 {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
588 {Intrinsic::vp_bswap, MVT::v2i64, 31},
589 {Intrinsic::vp_bswap, MVT::v4i64, 31},
590 {Intrinsic::vp_bswap, MVT::v8i64, 31},
591 {Intrinsic::vp_bswap, MVT::v16i64, 31},
592 {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
593 {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
594 {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
595 {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
596 {Intrinsic::vp_fshl, MVT::v2i8, 7},
597 {Intrinsic::vp_fshl, MVT::v4i8, 7},
598 {Intrinsic::vp_fshl, MVT::v8i8, 7},
599 {Intrinsic::vp_fshl, MVT::v16i8, 7},
600 {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
601 {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
602 {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
603 {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
604 {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
605 {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
606 {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
607 {Intrinsic::vp_fshl, MVT::v2i16, 7},
608 {Intrinsic::vp_fshl, MVT::v4i16, 7},
609 {Intrinsic::vp_fshl, MVT::v8i16, 7},
610 {Intrinsic::vp_fshl, MVT::v16i16, 7},
611 {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
612 {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
613 {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
614 {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
615 {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
616 {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
617 {Intrinsic::vp_fshl, MVT::v2i32, 7},
618 {Intrinsic::vp_fshl, MVT::v4i32, 7},
619 {Intrinsic::vp_fshl, MVT::v8i32, 7},
620 {Intrinsic::vp_fshl, MVT::v16i32, 7},
621 {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
622 {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
623 {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
624 {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
625 {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
626 {Intrinsic::vp_fshl, MVT::v2i64, 7},
627 {Intrinsic::vp_fshl, MVT::v4i64, 7},
628 {Intrinsic::vp_fshl, MVT::v8i64, 7},
629 {Intrinsic::vp_fshl, MVT::v16i64, 7},
630 {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
631 {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
632 {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
633 {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
634 {Intrinsic::vp_fshr, MVT::v2i8, 7},
635 {Intrinsic::vp_fshr, MVT::v4i8, 7},
636 {Intrinsic::vp_fshr, MVT::v8i8, 7},
637 {Intrinsic::vp_fshr, MVT::v16i8, 7},
638 {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
639 {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
640 {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
641 {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
642 {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
643 {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
644 {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
645 {Intrinsic::vp_fshr, MVT::v2i16, 7},
646 {Intrinsic::vp_fshr, MVT::v4i16, 7},
647 {Intrinsic::vp_fshr, MVT::v8i16, 7},
648 {Intrinsic::vp_fshr, MVT::v16i16, 7},
649 {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
650 {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
651 {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
652 {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
653 {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
654 {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
655 {Intrinsic::vp_fshr, MVT::v2i32, 7},
656 {Intrinsic::vp_fshr, MVT::v4i32, 7},
657 {Intrinsic::vp_fshr, MVT::v8i32, 7},
658 {Intrinsic::vp_fshr, MVT::v16i32, 7},
659 {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
660 {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
661 {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
662 {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
663 {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
664 {Intrinsic::vp_fshr, MVT::v2i64, 7},
665 {Intrinsic::vp_fshr, MVT::v4i64, 7},
666 {Intrinsic::vp_fshr, MVT::v8i64, 7},
667 {Intrinsic::vp_fshr, MVT::v16i64, 7},
668 {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
669 {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
670 {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
671 {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
672 {Intrinsic::bitreverse, MVT::v2i8, 17},
673 {Intrinsic::bitreverse, MVT::v4i8, 17},
674 {Intrinsic::bitreverse, MVT::v8i8, 17},
675 {Intrinsic::bitreverse, MVT::v16i8, 17},
676 {Intrinsic::bitreverse, MVT::nxv1i8, 17},
677 {Intrinsic::bitreverse, MVT::nxv2i8, 17},
678 {Intrinsic::bitreverse, MVT::nxv4i8, 17},
679 {Intrinsic::bitreverse, MVT::nxv8i8, 17},
680 {Intrinsic::bitreverse, MVT::nxv16i8, 17},
681 {Intrinsic::bitreverse, MVT::v2i16, 24},
682 {Intrinsic::bitreverse, MVT::v4i16, 24},
683 {Intrinsic::bitreverse, MVT::v8i16, 24},
684 {Intrinsic::bitreverse, MVT::v16i16, 24},
685 {Intrinsic::bitreverse, MVT::nxv1i16, 24},
686 {Intrinsic::bitreverse, MVT::nxv2i16, 24},
687 {Intrinsic::bitreverse, MVT::nxv4i16, 24},
688 {Intrinsic::bitreverse, MVT::nxv8i16, 24},
689 {Intrinsic::bitreverse, MVT::nxv16i16, 24},
690 {Intrinsic::bitreverse, MVT::v2i32, 33},
691 {Intrinsic::bitreverse, MVT::v4i32, 33},
692 {Intrinsic::bitreverse, MVT::v8i32, 33},
693 {Intrinsic::bitreverse, MVT::v16i32, 33},
694 {Intrinsic::bitreverse, MVT::nxv1i32, 33},
695 {Intrinsic::bitreverse, MVT::nxv2i32, 33},
696 {Intrinsic::bitreverse, MVT::nxv4i32, 33},
697 {Intrinsic::bitreverse, MVT::nxv8i32, 33},
698 {Intrinsic::bitreverse, MVT::nxv16i32, 33},
699 {Intrinsic::bitreverse, MVT::v2i64, 52},
700 {Intrinsic::bitreverse, MVT::v4i64, 52},
701 {Intrinsic::bitreverse, MVT::v8i64, 52},
702 {Intrinsic::bitreverse, MVT::v16i64, 52},
703 {Intrinsic::bitreverse, MVT::nxv1i64, 52},
704 {Intrinsic::bitreverse, MVT::nxv2i64, 52},
705 {Intrinsic::bitreverse, MVT::nxv4i64, 52},
706 {Intrinsic::bitreverse, MVT::nxv8i64, 52},
707 {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
708 {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
709 {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
710 {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
711 {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
712 {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
713 {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
714 {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
715 {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
716 {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
717 {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
718 {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
719 {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
720 {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
721 {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
722 {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
723 {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
724 {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
725 {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
726 {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
727 {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
728 {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
729 {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
730 {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
731 {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
732 {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
733 {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
734 {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
735 {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
736 {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
737 {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
738 {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
739 {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
740 {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
741 {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
742 {Intrinsic::ctpop, MVT::v2i8, 12},
743 {Intrinsic::ctpop, MVT::v4i8, 12},
744 {Intrinsic::ctpop, MVT::v8i8, 12},
745 {Intrinsic::ctpop, MVT::v16i8, 12},
746 {Intrinsic::ctpop, MVT::nxv1i8, 12},
747 {Intrinsic::ctpop, MVT::nxv2i8, 12},
748 {Intrinsic::ctpop, MVT::nxv4i8, 12},
749 {Intrinsic::ctpop, MVT::nxv8i8, 12},
750 {Intrinsic::ctpop, MVT::nxv16i8, 12},
751 {Intrinsic::ctpop, MVT::v2i16, 19},
752 {Intrinsic::ctpop, MVT::v4i16, 19},
753 {Intrinsic::ctpop, MVT::v8i16, 19},
754 {Intrinsic::ctpop, MVT::v16i16, 19},
755 {Intrinsic::ctpop, MVT::nxv1i16, 19},
756 {Intrinsic::ctpop, MVT::nxv2i16, 19},
757 {Intrinsic::ctpop, MVT::nxv4i16, 19},
758 {Intrinsic::ctpop, MVT::nxv8i16, 19},
759 {Intrinsic::ctpop, MVT::nxv16i16, 19},
760 {Intrinsic::ctpop, MVT::v2i32, 20},
761 {Intrinsic::ctpop, MVT::v4i32, 20},
762 {Intrinsic::ctpop, MVT::v8i32, 20},
763 {Intrinsic::ctpop, MVT::v16i32, 20},
764 {Intrinsic::ctpop, MVT::nxv1i32, 20},
765 {Intrinsic::ctpop, MVT::nxv2i32, 20},
766 {Intrinsic::ctpop, MVT::nxv4i32, 20},
767 {Intrinsic::ctpop, MVT::nxv8i32, 20},
768 {Intrinsic::ctpop, MVT::nxv16i32, 20},
769 {Intrinsic::ctpop, MVT::v2i64, 21},
770 {Intrinsic::ctpop, MVT::v4i64, 21},
771 {Intrinsic::ctpop, MVT::v8i64, 21},
772 {Intrinsic::ctpop, MVT::v16i64, 21},
773 {Intrinsic::ctpop, MVT::nxv1i64, 21},
774 {Intrinsic::ctpop, MVT::nxv2i64, 21},
775 {Intrinsic::ctpop, MVT::nxv4i64, 21},
776 {Intrinsic::ctpop, MVT::nxv8i64, 21},
777 {Intrinsic::vp_ctpop, MVT::v2i8, 12},
778 {Intrinsic::vp_ctpop, MVT::v4i8, 12},
779 {Intrinsic::vp_ctpop, MVT::v8i8, 12},
780 {Intrinsic::vp_ctpop, MVT::v16i8, 12},
781 {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
782 {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
783 {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
784 {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
785 {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
786 {Intrinsic::vp_ctpop, MVT::v2i16, 19},
787 {Intrinsic::vp_ctpop, MVT::v4i16, 19},
788 {Intrinsic::vp_ctpop, MVT::v8i16, 19},
789 {Intrinsic::vp_ctpop, MVT::v16i16, 19},
790 {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
791 {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
792 {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
793 {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
794 {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
795 {Intrinsic::vp_ctpop, MVT::v2i32, 20},
796 {Intrinsic::vp_ctpop, MVT::v4i32, 20},
797 {Intrinsic::vp_ctpop, MVT::v8i32, 20},
798 {Intrinsic::vp_ctpop, MVT::v16i32, 20},
799 {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
800 {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
801 {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
802 {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
803 {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
804 {Intrinsic::vp_ctpop, MVT::v2i64, 21},
805 {Intrinsic::vp_ctpop, MVT::v4i64, 21},
806 {Intrinsic::vp_ctpop, MVT::v8i64, 21},
807 {Intrinsic::vp_ctpop, MVT::v16i64, 21},
808 {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
809 {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
810 {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
811 {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
812 {Intrinsic::vp_ctlz, MVT::v2i8, 19},
813 {Intrinsic::vp_ctlz, MVT::v4i8, 19},
814 {Intrinsic::vp_ctlz, MVT::v8i8, 19},
815 {Intrinsic::vp_ctlz, MVT::v16i8, 19},
816 {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
817 {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
818 {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
819 {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
820 {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
821 {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
822 {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
823 {Intrinsic::vp_ctlz, MVT::v2i16, 28},
824 {Intrinsic::vp_ctlz, MVT::v4i16, 28},
825 {Intrinsic::vp_ctlz, MVT::v8i16, 28},
826 {Intrinsic::vp_ctlz, MVT::v16i16, 28},
827 {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
828 {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
829 {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
830 {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
831 {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
832 {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
833 {Intrinsic::vp_ctlz, MVT::v2i32, 31},
834 {Intrinsic::vp_ctlz, MVT::v4i32, 31},
835 {Intrinsic::vp_ctlz, MVT::v8i32, 31},
836 {Intrinsic::vp_ctlz, MVT::v16i32, 31},
837 {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
838 {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
839 {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
840 {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
841 {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
842 {Intrinsic::vp_ctlz, MVT::v2i64, 35},
843 {Intrinsic::vp_ctlz, MVT::v4i64, 35},
844 {Intrinsic::vp_ctlz, MVT::v8i64, 35},
845 {Intrinsic::vp_ctlz, MVT::v16i64, 35},
846 {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
847 {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
848 {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
849 {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
850 {Intrinsic::vp_cttz, MVT::v2i8, 16},
851 {Intrinsic::vp_cttz, MVT::v4i8, 16},
852 {Intrinsic::vp_cttz, MVT::v8i8, 16},
853 {Intrinsic::vp_cttz, MVT::v16i8, 16},
854 {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
855 {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
856 {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
857 {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
858 {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
859 {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
860 {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
861 {Intrinsic::vp_cttz, MVT::v2i16, 23},
862 {Intrinsic::vp_cttz, MVT::v4i16, 23},
863 {Intrinsic::vp_cttz, MVT::v8i16, 23},
864 {Intrinsic::vp_cttz, MVT::v16i16, 23},
865 {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
866 {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
867 {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
868 {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
869 {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
870 {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
871 {Intrinsic::vp_cttz, MVT::v2i32, 24},
872 {Intrinsic::vp_cttz, MVT::v4i32, 24},
873 {Intrinsic::vp_cttz, MVT::v8i32, 24},
874 {Intrinsic::vp_cttz, MVT::v16i32, 24},
875 {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
876 {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
877 {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
878 {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
879 {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
880 {Intrinsic::vp_cttz, MVT::v2i64, 25},
881 {Intrinsic::vp_cttz, MVT::v4i64, 25},
882 {Intrinsic::vp_cttz, MVT::v8i64, 25},
883 {Intrinsic::vp_cttz, MVT::v16i64, 25},
884 {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
885 {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
886 {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
887 {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
888};
889
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
892#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
893 case Intrinsic::VPID: \
894 return ISD::VPSD;
895#include "llvm/IR/VPIntrinsics.def"
896#undef HELPER_MAP_VPID_TO_VPSD
897 }
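  // Every other intrinsic falls through to here; DELETED_NODE acts as the
  // "no matching VP SelectionDAG opcode" sentinel for callers.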
898 return ISD::DELETED_NODE;
899}
900
InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
905 switch (ICA.getID()) {
906 case Intrinsic::ceil:
907 case Intrinsic::floor:
908 case Intrinsic::trunc:
909 case Intrinsic::rint:
910 case Intrinsic::round:
911 case Intrinsic::roundeven: {
912 // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
915 return LT.first * 8;
916 break;
917 }
918 case Intrinsic::umin:
919 case Intrinsic::umax:
920 case Intrinsic::smin:
921 case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if ((ST->hasVInstructions() && LT.second.isVector()) ||
924 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
925 return LT.first;
926 break;
927 }
928 case Intrinsic::sadd_sat:
929 case Intrinsic::ssub_sat:
930 case Intrinsic::uadd_sat:
931 case Intrinsic::usub_sat:
932 case Intrinsic::fabs:
933 case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
936 return LT.first;
937 break;
938 }
939 case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
942 // vrsub.vi v10, v8, 0
943 // vmax.vv v8, v8, v10
944 return LT.first * 2;
945 }
946 break;
947 }
  // TODO: add more intrinsics
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = getTypeLegalizationCost(RetTy);
    return Cost + (LT.first - 1);
953 }
954 case Intrinsic::vp_rint: {
955 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
956 unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
959 return Cost * LT.first;
960 break;
961 }
962 case Intrinsic::vp_nearbyint: {
    // One more read and one more write of fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
967 return Cost * LT.first;
968 break;
969 }
970 case Intrinsic::vp_ceil:
971 case Intrinsic::vp_floor:
972 case Intrinsic::vp_round:
973 case Intrinsic::vp_roundeven:
974 case Intrinsic::vp_roundtozero: {
975 // Rounding with static rounding mode needs two more instructions to
976 // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
980 if (TLI->isOperationCustom(VPISD, LT.second))
981 return Cost * LT.first;
982 break;
983 }
984 }
985
  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    auto LT = getTypeLegalizationCost(RetTy);
    if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
989 ICA.getID(), LT.second))
990 return LT.first * Entry->Cost;
991 }
992
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
995
InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
1001 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
1002 // FIXME: Need to compute legalizing cost for illegal types.
1003 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
1004 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1005
1006 // Skip if element size of Dst or Src is bigger than ELEN.
1007 if (Src->getScalarSizeInBits() > ST->getELEN() ||
1008 Dst->getScalarSizeInBits() > ST->getELEN())
1009 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1010
1011 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1012 assert(ISD && "Invalid opcode");
1013
1014 // FIXME: Need to consider vsetvli and lmul.
1015 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
1016 (int)Log2_32(Src->getScalarSizeInBits());
1017 switch (ISD) {
1018 case ISD::SIGN_EXTEND:
1019 case ISD::ZERO_EXTEND:
1020 if (Src->getScalarSizeInBits() == 1) {
1021 // We do not use vsext/vzext to extend from mask vector.
1022 // Instead we use the following instructions to extend from mask vector:
1023 // vmv.v.i v8, 0
1024 // vmerge.vim v8, v8, -1, v0
1025 return 2;
1026 }
1027 return 1;
1028 case ISD::TRUNCATE:
1029 if (Dst->getScalarSizeInBits() == 1) {
      // We do not use several vncvt instructions to truncate to a mask vector,
      // so we cannot use PowDiff to calculate the cost.
1032 // Instead we use the following instructions to truncate to mask vector:
1033 // vand.vi v8, v8, 1
1034 // vmsne.vi v0, v8, 0
1035 return 2;
1036 }
1037 [[fallthrough]];
1038 case ISD::FP_EXTEND:
1039 case ISD::FP_ROUND:
1040 // Counts of narrow/widen instructions.
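      // e.g. (illustrative): fpext from f16 to f64 elements has
      // PowDiff = log2(64) - log2(16) = 2, i.e. two widening converts.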
1041 return std::abs(PowDiff);
1042 case ISD::FP_TO_SINT:
1043 case ISD::FP_TO_UINT:
1044 case ISD::SINT_TO_FP:
1045 case ISD::UINT_TO_FP:
1046 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
      // The cost of converting from or to a mask vector is different from
      // other cases, so we cannot use PowDiff to calculate it.
1049 // For mask vector to fp, we should use the following instructions:
1050 // vmv.v.i v8, 0
1051 // vmerge.vim v8, v8, -1, v0
1052 // vfcvt.f.x.v v8, v8
1053
1054 // And for fp vector to mask, we use:
1055 // vfncvt.rtz.x.f.w v9, v8
1056 // vand.vi v8, v9, 1
1057 // vmsne.vi v0, v8, 0
1058 return 3;
1059 }
1060 if (std::abs(PowDiff) <= 1)
1061 return 1;
      // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only needs two conversions.
1064 if (Src->isIntOrIntVectorTy())
1065 return 2;
1066 // Counts of narrow/widen instructions.
1067 return std::abs(PowDiff);
1068 }
1069 }
1070 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1071}
1072
1073unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1074 if (isa<ScalableVectorType>(Ty)) {
1075 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1076 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1077 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1078 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1079 }
1080 return cast<FixedVectorType>(Ty)->getNumElements();
1081}
1082
InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                     bool IsUnsigned,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1088 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1089
1090 // Skip if scalar size of Ty is bigger than ELEN.
1091 if (Ty->getScalarSizeInBits() > ST->getELEN())
1092 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1093
1094 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1095 if (Ty->getElementType()->isIntegerTy(1))
1096 // vcpop sequences, see vreduction-mask.ll. umax, smin actually only
    // cost 2, but we don't have enough info here, so we slightly overestimate.
1098 return (LT.first - 1) + 3;
1099
  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
1101 InstructionCost BaseCost = 2;
1102
  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;
1105
1106 unsigned VL = getEstimatedVLFor(Ty);
1107 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1108}
1109
InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1115 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1116
1117 // Skip if scalar size of Ty is bigger than ELEN.
1118 if (Ty->getScalarSizeInBits() > ST->getELEN())
1119 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1120
1121 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1122 assert(ISD && "Invalid opcode");
1123
1124 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1125 ISD != ISD::FADD)
1126 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1127
1128 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1129 if (Ty->getElementType()->isIntegerTy(1))
1130 // vcpop sequences, see vreduction-mask.ll
1131 return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
1132
  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
1134 InstructionCost BaseCost = 2;
1135
  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;
1138
1139 unsigned VL = getEstimatedVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
1142 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1143}
1144
InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1147 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
1148 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1149 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1150 FMF, CostKind);
1151
1152 // Skip if scalar size of ResTy is bigger than ELEN.
1153 if (ResTy->getScalarSizeInBits() > ST->getELEN())
1154 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1155 FMF, CostKind);
1156
1157 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1158 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1159 FMF, CostKind);
1160
1161 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1162
1163 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1164 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1165 FMF, CostKind);
1166
1167 return (LT.first - 1) +
1168 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1169}
1170
InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
1175 if (!isa<VectorType>(Ty))
1176 // FIXME: We need to account for immediate materialization here, but doing
1177 // a decent job requires more knowledge about the immediate than we
1178 // currently have here.
1179 return 0;
1180
1181 if (OpInfo.isUniform())
1182 // vmv.x.i, vmv.v.x, or vfmv.v.f
1183 // We ignore the cost of the scalar constant materialization to be consistent
1184 // with how we treat scalar constants themselves just above.
1185 return 1;
1186
1187 // Add a cost of address generation + the cost of the vector load. The
1188 // address is expected to be a PC relative offset to a constant pool entry
1189 // using auipc/addi.
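  // e.g. (illustrative): auipc+addi materialize the constant-pool address, and
  // a vle* instruction then loads the vector constant.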
1190 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
1191 /*AddressSpace=*/0, CostKind);
1192}
1193
1194
InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
1203 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1204 return Cost + BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1205 CostKind, OpInfo, I);
1206}
1207
InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1215 I);
1216
1217 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1218 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1219 I);
1220
1221 // Skip if scalar size of ValTy is bigger than ELEN.
1222 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELEN())
1223 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1224 I);
1225
1226 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1227 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1228 if (CondTy->isVectorTy()) {
1229 if (ValTy->getScalarSizeInBits() == 1) {
1230 // vmandn.mm v8, v8, v9
1231 // vmand.mm v9, v0, v9
1232 // vmor.mm v0, v9, v8
1233 return LT.first * 3;
1234 }
1235 // vselect and max/min are supported natively.
1236 return LT.first * 1;
1237 }
1238
1239 if (ValTy->getScalarSizeInBits() == 1) {
1240 // vmv.v.x v9, a0
1241 // vmsne.vi v9, v9, 0
1242 // vmandn.mm v8, v8, v9
1243 // vmand.mm v9, v0, v9
1244 // vmor.mm v0, v9, v8
1245 return LT.first * 5;
1246 }
1247
1248 // vmv.v.x v10, a0
1249 // vmsne.vi v0, v10, 0
1250 // vmerge.vvm v8, v9, v8, v0
1251 return LT.first * 3;
1252 }
1253
1254 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1255 ValTy->isVectorTy()) {
1256 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1257
1258 // Support natively.
1259 if (CmpInst::isIntPredicate(VecPred))
1260 return LT.first * 1;
1261
1262 // If we do not support the input floating point vector type, use the base
1263 // one which will calculate as:
1264 // ScalarizeCost + Num * Cost for fixed vector,
1265 // InvalidCost for scalable vector.
1266 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1267 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1268 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1269 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1270 I);
1271 switch (VecPred) {
1272 // Support natively.
1273 case CmpInst::FCMP_OEQ:
1274 case CmpInst::FCMP_OGT:
1275 case CmpInst::FCMP_OGE:
1276 case CmpInst::FCMP_OLT:
1277 case CmpInst::FCMP_OLE:
1278 case CmpInst::FCMP_UNE:
1279 return LT.first * 1;
1280 // TODO: Other comparisons?
1281 default:
1282 break;
1283 }
1284 }
1285
1286 // TODO: Add cost for scalar type.
1287
1288 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1289}
1290
InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
1294 Value *Op1) {
1295 assert(Val->isVectorTy() && "This must be a vector type");
1296
1297 if (Opcode != Instruction::ExtractElement &&
1298 Opcode != Instruction::InsertElement)
1299 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1300
1301 // Legalize the type.
1302 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1303
1304 // This type is legalized to a scalar type.
1305 if (!LT.second.isVector())
1306 return 0;
1307
1308 // For unsupported scalable vector.
1309 if (LT.second.isScalableVector() && !LT.first.isValid())
1310 return LT.first;
1311
1312 if (!isTypeLegal(Val))
1313 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1314
1315 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1316 // and vslideup + vmv.s.x to insert element to vector.
1317 unsigned BaseCost = 1;
  // For insertelement, an extra addi is needed to compute Index + 1 as the
  // AVL input of vslideup.
1319 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1320
1321 if (Index != -1U) {
1322 // The type may be split. For fixed-width vectors we can normalize the
1323 // index to the new type.
1324 if (LT.second.isFixedLengthVector()) {
1325 unsigned Width = LT.second.getVectorNumElements();
1326 Index = Index % Width;
1327 }
1328
1329 // We could extract/insert the first element without vslidedown/vslideup.
1330 if (Index == 0)
1331 SlideCost = 0;
1332 else if (Opcode == Instruction::InsertElement)
1333 SlideCost = 1; // With a constant index, we do not need to use addi.
1334 }
1335
1336 // Mask vector extract/insert element is different from normal case.
1337 if (Val->getScalarSizeInBits() == 1) {
1338 // For extractelement, we need the following instructions:
1339 // vmv.v.i v8, 0
1340 // vmerge.vim v8, v8, 1, v0
1341 // vsetivli zero, 1, e8, m2, ta, mu (not count)
1342 // vslidedown.vx v8, v8, a0
1343 // vmv.x.s a0, v8
1344
1345 // For insertelement, we need the following instructions:
1346 // vsetvli a2, zero, e8, m1, ta, mu (not count)
1347 // vmv.s.x v8, a0
1348 // vmv.v.i v9, 0
1349 // vmerge.vim v9, v9, 1, v0
1350 // addi a0, a1, 1
1351 // vsetvli zero, a0, e8, m1, tu, mu (not count)
1352 // vslideup.vx v9, v8, a1
1353 // vsetvli a0, zero, e8, m1, ta, mu (not count)
1354 // vand.vi v8, v9, 1
1355 // vmsne.vi v0, v8, 0
1356
1357 // TODO: should we count these special vsetvlis?
1358 BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
1359 }
  // Extracting an i64 element on a target with XLEN=32 needs more instructions.
1361 if (Val->getScalarType()->isIntegerTy() &&
1362 ST->getXLen() < Val->getScalarSizeInBits()) {
1363 // For extractelement, we need the following instructions:
1364 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1365 // vslidedown.vx v8, v8, a0
1366 // vmv.x.s a0, v8
1367 // li a1, 32
1368 // vsrl.vx v8, v8, a1
1369 // vmv.x.s a1, v8
1370
1371 // For insertelement, we need the following instructions:
1372 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1373 // vmv.v.i v12, 0
1374 // vslide1up.vx v16, v12, a1
1375 // vslide1up.vx v12, v16, a0
1376 // addi a0, a2, 1
1377 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1378 // vslideup.vx v8, v12, a2
1379
1380 // TODO: should we count these special vsetvlis?
1381 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1382 }
1383 return BaseCost + SlideCost;
1384}
1385
InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {
1390
1391 // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1394 Args, CxtI);
1395
1396 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1397 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1398 Args, CxtI);
1399
1400 // Skip if scalar size of Ty is bigger than ELEN.
1401 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELEN())
1402 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1403 Args, CxtI);
1404
1405 // Legalize the type.
1406 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1407
1408 // TODO: Handle scalar type.
1409 if (!LT.second.isVector())
1410 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1411 Args, CxtI);
1412
1413
1414 auto getConstantMatCost =
1415 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1416 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1417 // Two sub-cases:
1418 // * Has a 5 bit immediate operand which can be splatted.
1419 // * Has a larger immediate which must be materialized in scalar register
1420 // We return 0 for both as we currently ignore the cost of materializing
1421 // scalar constants in GPRs.
1422 return 0;
1423
1424 // Add a cost of address generation + the cost of the vector load. The
1425 // address is expected to be a PC relative offset to a constant pool entry
1426 // using auipc/addi.
1427 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
1428 /*AddressSpace=*/0, CostKind);
1429 };
1430
1431 // Add the cost of materializing any constant vectors required.
1432 InstructionCost ConstantMatCost = 0;
1433 if (Op1Info.isConstant())
1434 ConstantMatCost += getConstantMatCost(0, Op1Info);
1435 if (Op2Info.isConstant())
1436 ConstantMatCost += getConstantMatCost(1, Op2Info);
1437
1438 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1439 case ISD::ADD:
1440 case ISD::SUB:
1441 case ISD::AND:
1442 case ISD::OR:
1443 case ISD::XOR:
1444 case ISD::SHL:
1445 case ISD::SRL:
1446 case ISD::SRA:
1447 case ISD::MUL:
1448 case ISD::MULHS:
1449 case ISD::MULHU:
1450 case ISD::FADD:
1451 case ISD::FSUB:
1452 case ISD::FMUL:
1453 case ISD::FNEG: {
1454 return ConstantMatCost + getLMULCost(LT.second) * LT.first * 1;
1455 }
1456 default:
1457 return ConstantMatCost +
1458 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1459 Args, CxtI);
1460 }
1461}
1462
void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics with changes as needed
1467 // would apply to all settings below to enable performance.
1468
1469
1470 if (ST->enableDefaultUnroll())
1471 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1472
  // Enable upper-bound unrolling universally, not dependent upon the
  // conditions below.
1475 UP.UpperBound = true;
1476
1477 // Disable loop unrolling for Oz and Os.
1478 UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
1481 return;
1482
1483 SmallVector<BasicBlock *, 4> ExitingBlocks;
1484 L->getExitingBlocks(ExitingBlocks);
1485 LLVM_DEBUG(dbgs() << "Loop has:\n"
1486 << "Blocks: " << L->getNumBlocks() << "\n"
1487 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1488
1489 // Only allow another exit other than the latch. This acts as an early exit
1490 // as it mirrors the profitability calculation of the runtime unroller.
1491 if (ExitingBlocks.size() > 2)
1492 return;
1493
1494 // Limit the CFG of the loop body for targets with a branch predictor.
1495 // Allowing 4 blocks permits if-then-else diamonds in the body.
1496 if (L->getNumBlocks() > 4)
1497 return;
1498
1499 // Don't unroll vectorized loops, including the remainder loop
1500 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1501 return;
1502
1503 // Scan the loop: don't unroll loops with calls as this could prevent
1504 // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
1507 for (auto &I : *BB) {
1508 // Initial setting - Don't unroll loops containing vectorized
1509 // instructions.
1510 if (I.getType()->isVectorTy())
1511 return;
1512
1513 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1514 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1515 if (!isLoweredToCall(F))
1516 continue;
1517 }
1518 return;
1519 }
1520
1521 SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
1525 }
1526
1527 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1528
1529 UP.Partial = true;
1530 UP.Runtime = true;
1531 UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
1537 if (Cost < 12)
1538 UP.Force = true;
1539}
1540
void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
1545
unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
1549 if (Size.isScalable() && ST->hasVInstructions())
1550 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1551
    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
1554 }
1555
1556 return BaseT::getRegUsageForType(Ty);
1557}
1558
1559unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1560 // This interface is currently only used by SLP. Returning 1 (which is the
1561 // default value for SLPMaxVF) disables SLP. We currently have a cost modeling
1562 // problem w/ constant materialization which causes SLP to perform majorly
1563 // unprofitable transformations.
1564 // TODO: Figure out constant materialization cost modeling and remove.
1565 return SLPMaxVF;
1566}
1567
bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give instruction count first
  // priority.
1571 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1572 C1.NumIVMuls, C1.NumBaseAdds,
1573 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1574 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1575 C2.NumIVMuls, C2.NumBaseAdds,
1576 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1577}
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Result used for getMaximumVF query which is used exclusively by " "SLP vectorizer. Defaults to 1 which disables SLP."), cl::init(1), cl::Hidden)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID)
This file defines a TargetTransformInfo::Concept conforming object specific to the RISC-V target mach...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:75
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:41
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:538
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:716
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:850
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:715
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:964
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:610
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:814
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:994
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:718
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:721
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:724
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:722
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:723
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:725
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:734
bool isIntPredicate() const
Definition: InstrTypes.h:826
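
The O* predicates above are "ordered" (false whenever either operand is NaN), while U* predicates are "unordered" (true whenever either operand is NaN). A small standalone sketch of the OEQ/UNE semantics, independent of LLVM:

#include <cassert>
#include <cmath>

// Ordered-and-equal: false if either operand is NaN.
static bool fcmpOEQ(double A, double B) {
  return !std::isnan(A) && !std::isnan(B) && A == B;
}

// Unordered-or-not-equal: true if either operand is NaN.
static bool fcmpUNE(double A, double B) {
  return std::isnan(A) || std::isnan(B) || A != B;
}

int main() {
  double NaN = std::nan("");
  assert(fcmpOEQ(1.0, 1.0) && !fcmpOEQ(1.0, NaN));
  assert(fcmpUNE(1.0, NaN) && !fcmpUNE(2.0, 2.0));
}
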
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:848
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:669
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:704
static InstructionCost getInvalid(CostType Val=0)
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
Machine Value Type.
uint64_t getScalarSizeInBits() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:41
The optimization diagnostic interface.
bool hasVInstructionsF64() const
unsigned getRealMinVLen() const
bool useRVVForFixedLengthVectors() const
unsigned getXLen() const
unsigned getELEN() const
bool hasVInstructionsF16() const
bool hasVInstructions() const
unsigned getRealMaxVLen() const
bool hasVInstructionsF32() const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
std::optional< unsigned > getVScaleForTuning() const
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
std::optional< unsigned > getMaxVScale() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool shouldExpandReduction(const IntrinsicInst *II) const
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind)
Return the cost of materializing an immediate for a value operand of a store instruction.
unsigned getRegUsageForType(Type *Ty)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat operand.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
static RISCVII::VLMUL getLMUL(MVT VT)
The main scalar evolution driver.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_t size() const
Definition: SmallVector.h:91
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
const DataLayout & getDataLayout() const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags.
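
An ordered (sequential) reduction accumulates elements strictly left to right, whereas a reassociated reduction may regroup the additions; for floating point the two can round differently, which is why the reassoc fast-math flag is needed before the faster form may be used. A standalone illustration of that difference (not the LLVM helper itself):

#include <cstdio>

int main() {
  // Same four values, two association orders.
  float V[4] = {1e8f, 1.0f, -1e8f, 1.0f};

  // Ordered: ((a + b) + c) + d, as an in-order reduction computes it.
  float Ordered = ((V[0] + V[1]) + V[2]) + V[3];

  // Reassociated: (a + c) + (b + d), as a tree reduction might compute it.
  float Tree = (V[0] + V[2]) + (V[1] + V[3]);

  // With float rounding these differ: 1.0f is absorbed by 1e8f in the
  // ordered sum, so Ordered == 1.0f while Tree == 2.0f.
  std::printf("ordered=%f tree=%f\n", Ordered, Tree);
}
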
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:325
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:267
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:231
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:350
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
Type * getElementType() const
Definition: DerivedTypes.h:422
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:163
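
For a fixed TypeSize (TypeSize::getFixed) the known minimum is the exact size; for a scalable TypeSize (TypeSize::getScalable) it is the compile-time lower bound of a quantity that grows with the runtime vscale. A tiny conceptual sketch, not the LLVM TypeSize class:

#include <cassert>

// Minimal stand-in for the fixed/scalable size idea.
struct MiniTypeSize {
  unsigned MinValue; // compile-time known minimum, in bits
  bool Scalable;     // true: actual size is MinValue * vscale
};

int main() {
  MiniTypeSize Fixed{128, false};  // e.g. a 128-bit fixed-width vector
  MiniTypeSize Scal{64, true};     // e.g. a <vscale x 1 x i64> vector

  assert(Fixed.MinValue == 128);   // exact size
  assert(Scal.MinValue == 64);     // lower bound; real size is 64 * vscale
}
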
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:786
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:773
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:923
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high bits of the result.
Definition: ISDOpcodes.h:650
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:704
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:776
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:883
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:832
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:679
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination type.
Definition: ISDOpcodes.h:865
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:782
int getIntMatCost(const APInt &Val, unsigned Size, const FeatureBitset &ActiveFeatures, bool CompressionCost)
std::pair< unsigned, bool > decodeVLMUL(RISCVII::VLMUL VLMUL)
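
The returned pair separates the LMUL magnitude from whether it is fractional. The RISC-V V specification encodes LMUL in three vtype bits (000=1, 001=2, 010=4, 011=8, 101=1/8, 110=1/4, 111=1/2); the sketch below mirrors that spec encoding and is only an assumption about how the LLVM helper maps its enum, not its implementation:

#include <cassert>
#include <utility>

// Sketch: decode a 3-bit vtype LMUL field into (multiplier or divisor, isFractional),
// following the RISC-V V specification encoding. Assumed behavior, not LLVM code.
static std::pair<unsigned, bool> decodeVLMULSketch(unsigned VLMULBits) {
  switch (VLMULBits) {
  case 0: return {1, false};  // LMUL = 1
  case 1: return {2, false};  // LMUL = 2
  case 2: return {4, false};  // LMUL = 4
  case 3: return {8, false};  // LMUL = 8
  case 5: return {8, true};   // LMUL = 1/8
  case 6: return {4, true};   // LMUL = 1/4
  case 7: return {2, true};   // LMUL = 1/2
  default: return {0, false}; // encoding 4 is reserved
  }
}

int main() {
  assert(decodeVLMULSketch(3) == std::make_pair(8u, false));
  assert(decodeVLMULSketch(7) == std::make_pair(2u, true));
}
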
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:395
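
A standalone sketch of the documented contract (ceiling log base 2, returning 32 for a zero input); this mirrors the behavior described above, not the LLVM implementation:

#include <bit>
#include <cassert>
#include <cstdint>

// Ceil log base 2 of a 32-bit value; 32 for zero, per the documented contract.
static unsigned log2_32_ceil(uint32_t Value) {
  if (Value == 0)
    return 32;
  unsigned Floor = std::bit_width(Value) - 1;           // floor(log2(Value))
  return Floor + (std::has_single_bit(Value) ? 0 : 1);  // round up unless a power of two
}

int main() {
  assert(log2_32_ceil(1) == 0);
  assert(log2_32_ceil(8) == 3);
  assert(log2_32_ceil(9) == 4);
  assert(log2_32_ceil(0) == 32);
}
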
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1085
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:522
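
This is the rounding-up division used in getRegUsageForType above to turn a bit count into a register-block count. A standalone sketch with a worked example; the 256-bit and 64-bit-block numbers are illustrative only:

#include <cassert>
#include <cstdint>

// Integer ceil(Numerator / Denominator); mirrors the documented behavior.
static uint64_t divide_ceil(uint64_t Numerator, uint64_t Denominator) {
  return (Numerator + Denominator - 1) / Denominator;
}

int main() {
  // A vector with a known minimum of 256 bits, counted in 64-bit blocks,
  // occupies ceil(256 / 64) = 4 blocks.
  assert(divide_ceil(256, 64) == 4);
  // A 65-bit quantity still needs two 64-bit blocks.
  assert(divide_ceil(65, 64) == 2);
}
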
int countr_zero(T Val)
Count the number of zero bits from the least significant bit toward the most significant bit, stopping at the first one bit.
Definition: bit.h:179
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition: MathExtras.h:286
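
A shifted mask is a single contiguous run of ones, for example 0x0ff0; 0x0f0f is not. A standalone sketch of the check, mirroring the documented behavior rather than the LLVM implementation:

#include <bit>
#include <cassert>
#include <cstdint>

// True if Value is a non-empty contiguous run of ones, possibly shifted up.
static bool is_shifted_mask_64(uint64_t Value) {
  if (Value == 0)
    return false;
  uint64_t Run = Value >> std::countr_zero(Value); // drop trailing zeros
  return (Run & (Run + 1)) == 0;                   // remaining bits must be 0b0...01...1
}

int main() {
  assert(is_shifted_mask_64(0x0ff0));  // ones from bit 4 through bit 11
  assert(!is_shifted_mask_64(0x0f0f)); // two separate runs
  assert(!is_shifted_mask_64(0));      // an empty run does not count
}
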
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:382
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
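
Assuming the mask produced is {Start, Start+Stride, Start+2*Stride, ...} of length VF (an assumption based on the name and signature, not verified against the implementation), a strided mask selects every Stride-th lane, which is how a de-interleaving shuffle is expressed. A standalone sketch:

#include <cassert>
#include <vector>

// Sketch of a stride shuffle mask: element i selects source lane Start + i * Stride.
// Assumed behavior; not the LLVM helper itself.
static std::vector<int> strideMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(Start + I * Stride);
  return Mask;
}

int main() {
  // Picking the even lanes of an 8-element vector: {0, 2, 4, 6}.
  assert(strideMask(0, 2, 4) == std::vector<int>({0, 2, 4, 6}));
  // Picking the odd lanes: {1, 3, 5, 7}.
  assert(strideMask(1, 2, 4) == std::vector<int>({1, 3, 5, 7}));
}
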
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
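
Assuming the mask emits lane i of each of the NumVecs inputs in round-robin order (again an assumption from the name and signature), the result interleaves the vectors element-wise. A standalone sketch:

#include <cassert>
#include <vector>

// Sketch of an interleave shuffle mask for NumVecs vectors of VF elements each:
// lane i of vector j is lane j * VF + i of the concatenated input, and lanes are
// emitted round-robin across the vectors. Assumed behavior; not the LLVM helper.
static std::vector<int> interleaveMask(unsigned VF, unsigned NumVecs) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < NumVecs; ++J)
      Mask.push_back(J * VF + I);
  return Mask;
}

int main() {
  // Two 4-element vectors A = {a0..a3}, B = {b0..b3} interleave as
  // {a0, b0, a1, b1, a2, b2, a3, b3}.
  assert(interleaveMask(4, 2) == std::vector<int>({0, 4, 1, 5, 2, 6, 3, 7}));
}
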
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2050
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:291
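
C++20's std::bit_floor has the same contract described above (largest power of two not exceeding the value, with zero mapping to zero), so a standalone example can use it directly:

#include <bit>
#include <cassert>

int main() {
  // Largest power of two no greater than the argument; zero maps to zero.
  assert(std::bit_floor(1u) == 1u);
  assert(std::bit_floor(5u) == 4u);
  assert(std::bit_floor(64u) == 64u);
  assert(std::bit_floor(0u) == 0u);
}
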
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch overhead).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).