LLVM 23.0.0git
BasicTTIImpl.h
Go to the documentation of this file.
1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/BitVector.h"
21#include "llvm/ADT/STLExtras.h"
35#include "llvm/IR/BasicBlock.h"
36#include "llvm/IR/Constant.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
40#include "llvm/IR/InstrTypes.h"
41#include "llvm/IR/Instruction.h"
43#include "llvm/IR/Intrinsics.h"
44#include "llvm/IR/Operator.h"
45#include "llvm/IR/Type.h"
46#include "llvm/IR/Value.h"
55#include <algorithm>
56#include <cassert>
57#include <cstdint>
58#include <limits>
59#include <optional>
60#include <utility>
61
62namespace llvm {
63
64class Function;
65class GlobalValue;
66class LLVMContext;
67class ScalarEvolution;
68class SCEV;
69class TargetMachine;
70
72
73/// Base class which can be used to help build a TTI implementation.
74///
75/// This class provides as much implementation of the TTI interface as is
76/// possible using the target independent parts of the code generator.
77///
78/// In order to subclass it, your class must implement a getST() method to
79/// return the subtarget, and a getTLI() method to return the target lowering.
80/// We need these methods implemented in the derived class so that this class
81/// doesn't have to duplicate storage for them.
82template <typename T>
84private:
86 using TTI = TargetTransformInfo;
87
88 /// Helper function to access this as a T.
89 const T *thisT() const { return static_cast<const T *>(this); }
90
91 /// Estimate a cost of Broadcast as an extract and sequence of insert
92 /// operations.
94 getBroadcastShuffleOverhead(FixedVectorType *VTy,
97 // Broadcast cost is equal to the cost of extracting the zero'th element
98 // plus the cost of inserting it into every element of the result vector.
99 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
100 CostKind, 0, nullptr, nullptr);
101
102 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
103 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
104 CostKind, i, nullptr, nullptr);
105 }
106 return Cost;
107 }
108
109 /// Estimate a cost of shuffle as a sequence of extract and insert
110 /// operations.
112 getPermuteShuffleOverhead(FixedVectorType *VTy,
115 // Shuffle cost is equal to the cost of extracting element from its argument
116 // plus the cost of inserting them onto the result vector.
117
118 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
119 // index 0 of first vector, index 1 of second vector,index 2 of first
120 // vector and finally index 3 of second vector and insert them at index
121 // <0,1,2,3> of result vector.
122 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
123 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
124 CostKind, i, nullptr, nullptr);
125 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
126 CostKind, i, nullptr, nullptr);
127 }
128 return Cost;
129 }
130
131 /// Estimate a cost of subvector extraction as a sequence of extract and
132 /// insert operations.
133 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
135 int Index,
136 FixedVectorType *SubVTy) const {
137 assert(VTy && SubVTy &&
138 "Can only extract subvectors from vectors");
139 int NumSubElts = SubVTy->getNumElements();
141 (Index + NumSubElts) <=
143 "SK_ExtractSubvector index out of range");
144
146 // Subvector extraction cost is equal to the cost of extracting element from
147 // the source type plus the cost of inserting them into the result vector
148 // type.
149 for (int i = 0; i != NumSubElts; ++i) {
150 Cost +=
151 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
152 CostKind, i + Index, nullptr, nullptr);
153 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
154 CostKind, i, nullptr, nullptr);
155 }
156 return Cost;
157 }
158
159 /// Estimate a cost of subvector insertion as a sequence of extract and
160 /// insert operations.
161 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
163 int Index,
164 FixedVectorType *SubVTy) const {
165 assert(VTy && SubVTy &&
166 "Can only insert subvectors into vectors");
167 int NumSubElts = SubVTy->getNumElements();
169 (Index + NumSubElts) <=
171 "SK_InsertSubvector index out of range");
172
174 // Subvector insertion cost is equal to the cost of extracting element from
175 // the source type plus the cost of inserting them into the result vector
176 // type.
177 for (int i = 0; i != NumSubElts; ++i) {
178 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
179 CostKind, i, nullptr, nullptr);
180 Cost +=
181 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
182 i + Index, nullptr, nullptr);
183 }
184 return Cost;
185 }
186
187 /// Local query method delegates up to T which *must* implement this!
188 const TargetSubtargetInfo *getST() const {
189 return static_cast<const T *>(this)->getST();
190 }
191
192 /// Local query method delegates up to T which *must* implement this!
193 const TargetLoweringBase *getTLI() const {
194 return static_cast<const T *>(this)->getTLI();
195 }
196
197 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
198 switch (M) {
200 return ISD::UNINDEXED;
201 case TTI::MIM_PreInc:
202 return ISD::PRE_INC;
203 case TTI::MIM_PreDec:
204 return ISD::PRE_DEC;
205 case TTI::MIM_PostInc:
206 return ISD::POST_INC;
207 case TTI::MIM_PostDec:
208 return ISD::POST_DEC;
209 }
210 llvm_unreachable("Unexpected MemIndexedMode");
211 }
212
213 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
214 Align Alignment,
215 bool VariableMask,
216 bool IsGatherScatter,
218 unsigned AddressSpace = 0) const {
219 // We cannot scalarize scalable vectors, so return Invalid.
220 if (isa<ScalableVectorType>(DataTy))
222
223 auto *VT = cast<FixedVectorType>(DataTy);
224 unsigned VF = VT->getNumElements();
225
226 // Assume the target does not have support for gather/scatter operations
227 // and provide a rough estimate.
228 //
229 // First, compute the cost of the individual memory operations.
230 InstructionCost AddrExtractCost =
231 IsGatherScatter ? getScalarizationOverhead(
233 PointerType::get(VT->getContext(), 0), VF),
234 /*Insert=*/false, /*Extract=*/true, CostKind)
235 : 0;
236
237 // The cost of the scalar loads/stores.
238 InstructionCost MemoryOpCost =
239 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
241
242 // Next, compute the cost of packing the result in a vector.
243 InstructionCost PackingCost =
244 getScalarizationOverhead(VT, Opcode != Instruction::Store,
245 Opcode == Instruction::Store, CostKind);
246
247 InstructionCost ConditionalCost = 0;
248 if (VariableMask) {
249 // Compute the cost of conditionally executing the memory operations with
250 // variable masks. This includes extracting the individual conditions, the
251 // branches, and the PHIs to combine the results.
252 // NOTE: Estimating the cost of conditionally executing the memory
253 // operations accurately is quite difficult and the current solution
254 // provides a very rough estimate only.
255 ConditionalCost =
258 /*Insert=*/false, /*Extract=*/true, CostKind) +
259 VF * (thisT()->getCFInstrCost(Instruction::CondBr, CostKind) +
260 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
261 }
262
263 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
264 }
265
266 /// Checks if the provided mask \p is a splat mask, i.e. it contains only -1
267 /// or same non -1 index value and this index value contained at least twice.
268 /// So, mask <0, -1,-1, -1> is not considered splat (it is just identity),
269 /// same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1> is a splat
270 /// with \p Index=2.
271 static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) {
272 // Check that the broadcast index meets at least twice.
273 bool IsCompared = false;
274 if (int SplatIdx = PoisonMaskElem;
275 all_of(enumerate(Mask), [&](const auto &P) {
276 if (P.value() == PoisonMaskElem)
277 return P.index() != Mask.size() - 1 || IsCompared;
278 if (static_cast<unsigned>(P.value()) >= NumSrcElts * 2)
279 return false;
280 if (SplatIdx == PoisonMaskElem) {
281 SplatIdx = P.value();
282 return P.index() != Mask.size() - 1;
283 }
284 IsCompared = true;
285 return SplatIdx == P.value();
286 })) {
287 Index = SplatIdx;
288 return true;
289 }
290 return false;
291 }
292
293 /// Several intrinsics that return structs (including llvm.sincos[pi] and
294 /// llvm.modf) can be lowered to a vector library call (for certain VFs). The
295 /// vector library functions correspond to the scalar calls (e.g. sincos or
296 /// modf), which unlike the intrinsic return values via output pointers. This
297 /// helper checks if a vector call exists for the given intrinsic, and returns
298 /// the cost, which includes the cost of the mask (if required), and the loads
299 /// for values returned via output pointers. \p LC is the scalar libcall and
300 /// \p CallRetElementIndex (optional) is the struct element which is mapped to
301 /// the call return value. If std::nullopt is returned, then no vector library
302 /// call is available, so the intrinsic should be assigned the default cost
303 /// (e.g. scalarization).
304 std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost(
306 std::optional<unsigned> CallRetElementIndex = {}) const {
307 Type *RetTy = ICA.getReturnType();
308 // Vector variants of the intrinsic can be mapped to a vector library call.
309 if (!isa<StructType>(RetTy) ||
311 return std::nullopt;
312
313 Type *Ty = getContainedTypes(RetTy).front();
314 EVT VT = getTLI()->getValueType(DL, Ty);
315
316 RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
317
318 switch (ICA.getID()) {
319 case Intrinsic::modf:
320 LC = RTLIB::getMODF(VT);
321 break;
322 case Intrinsic::sincospi:
323 LC = RTLIB::getSINCOSPI(VT);
324 break;
325 case Intrinsic::sincos:
326 LC = RTLIB::getSINCOS(VT);
327 break;
328 default:
329 return std::nullopt;
330 }
331
332 // Find associated libcall.
333 RTLIB::LibcallImpl LibcallImpl = getTLI()->getLibcallImpl(LC);
334 if (LibcallImpl == RTLIB::Unsupported)
335 return std::nullopt;
336
337 LLVMContext &Ctx = RetTy->getContext();
338
339 // Cost the call + mask.
340 auto Cost =
341 thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
342
345 auto VecTy = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
346 Cost += thisT()->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
347 VecTy, {}, CostKind, 0, nullptr, {});
348 }
349
350 // Lowering to a library call (with output pointers) may require us to emit
351 // reloads for the results.
352 for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) {
353 if (Idx == CallRetElementIndex)
354 continue;
355 Cost += thisT()->getMemoryOpCost(
356 Instruction::Load, VectorTy,
357 thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind);
358 }
359 return Cost;
360 }
361
362 /// Filter out constant and duplicated entries in \p Ops and return a vector
363 /// containing the types from \p Tys corresponding to the remaining operands.
365 filterConstantAndDuplicatedOperands(ArrayRef<const Value *> Ops,
366 ArrayRef<Type *> Tys) {
367 SmallPtrSet<const Value *, 4> UniqueOperands;
368 SmallVector<Type *, 4> FilteredTys;
369 for (const auto &[Op, Ty] : zip_equal(Ops, Tys)) {
370 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second)
371 continue;
372 FilteredTys.push_back(Ty);
373 }
374 return FilteredTys;
375 }
376
377protected:
378 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
379 : BaseT(DL) {}
380 ~BasicTTIImplBase() override = default;
381
384
385public:
386 /// \name Scalar TTI Implementations
387 /// @{
389 unsigned AddressSpace, Align Alignment,
390 unsigned *Fast) const override {
391 EVT E = EVT::getIntegerVT(Context, BitWidth);
392 return getTLI()->allowsMisalignedMemoryAccesses(
394 }
395
396 bool areInlineCompatible(const Function *Caller,
397 const Function *Callee) const override {
398 const TargetMachine &TM = getTLI()->getTargetMachine();
399
400 const FeatureBitset &CallerBits =
401 TM.getSubtargetImpl(*Caller)->getFeatureBits();
402 const FeatureBitset &CalleeBits =
403 TM.getSubtargetImpl(*Callee)->getFeatureBits();
404
405 // Inline a callee if its target-features are a subset of the callers
406 // target-features.
407 return (CallerBits & CalleeBits) == CalleeBits;
408 }
409
410 bool hasBranchDivergence(const Function *F = nullptr) const override {
411 return false;
412 }
413
414 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
415 return false;
416 }
417
418 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
419 return true;
420 }
421
422 unsigned getFlatAddressSpace() const override {
423 // Return an invalid address space.
424 return -1;
425 }
426
428 Intrinsic::ID IID) const override {
429 return false;
430 }
431
432 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
433 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
434 }
435
436 unsigned getAssumedAddrSpace(const Value *V) const override {
437 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
438 }
439
440 bool isSingleThreaded() const override {
441 return getTLI()->getTargetMachine().Options.ThreadModel ==
443 }
444
445 std::pair<const Value *, unsigned>
446 getPredicatedAddrSpace(const Value *V) const override {
447 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
448 }
449
451 Value *NewV) const override {
452 return nullptr;
453 }
454
455 bool isLegalAddImmediate(int64_t imm) const override {
456 return getTLI()->isLegalAddImmediate(imm);
457 }
458
459 bool isLegalAddScalableImmediate(int64_t Imm) const override {
460 return getTLI()->isLegalAddScalableImmediate(Imm);
461 }
462
463 bool isLegalICmpImmediate(int64_t imm) const override {
464 return getTLI()->isLegalICmpImmediate(imm);
465 }
466
467 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
468 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
469 Instruction *I = nullptr,
470 int64_t ScalableOffset = 0) const override {
472 AM.BaseGV = BaseGV;
473 AM.BaseOffs = BaseOffset;
474 AM.HasBaseReg = HasBaseReg;
475 AM.Scale = Scale;
476 AM.ScalableOffset = ScalableOffset;
477 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
478 }
479
480 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
481 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
482 }
483
484 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy,
485 Align Alignment,
486 unsigned AddrSpace) const override {
487 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy, Alignment,
488 AddrSpace](unsigned VF) {
489 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
490 EVT VT = getTLI()->getValueType(DL, SrcTy);
491 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
492 getTLI()->isOperationCustom(ISD::STORE, VT))
493 return true;
494
495 EVT ValVT =
496 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
497 EVT LegalizedVT =
498 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
499 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT, Alignment,
500 AddrSpace);
501 };
502 while (VF > 2 && IsSupportedByTarget(VF))
503 VF /= 2;
504 return VF;
505 }
506
507 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty) const override {
508 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
509 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
510 }
511
512 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty) const override {
513 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
514 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
515 }
516
518 const TTI::LSRCost &C2) const override {
520 }
521
525
529
533
535 StackOffset BaseOffset, bool HasBaseReg,
536 int64_t Scale,
537 unsigned AddrSpace) const override {
539 AM.BaseGV = BaseGV;
540 AM.BaseOffs = BaseOffset.getFixed();
541 AM.HasBaseReg = HasBaseReg;
542 AM.Scale = Scale;
543 AM.ScalableOffset = BaseOffset.getScalable();
544 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
545 return 0;
547 }
548
549 bool isTruncateFree(Type *Ty1, Type *Ty2) const override {
550 return getTLI()->isTruncateFree(Ty1, Ty2);
551 }
552
553 bool isProfitableToHoist(Instruction *I) const override {
554 return getTLI()->isProfitableToHoist(I);
555 }
556
557 bool useAA() const override { return getST()->useAA(); }
558
559 bool isTypeLegal(Type *Ty) const override {
560 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
561 return getTLI()->isTypeLegal(VT);
562 }
563
564 unsigned getRegUsageForType(Type *Ty) const override {
565 EVT ETy = getTLI()->getValueType(DL, Ty);
566 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
567 }
568
569 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
570 ArrayRef<const Value *> Operands, Type *AccessType,
571 TTI::TargetCostKind CostKind) const override {
572 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
573 }
574
576 const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI,
577 BlockFrequencyInfo *BFI) const override {
578 /// Try to find the estimated number of clusters. Note that the number of
579 /// clusters identified in this function could be different from the actual
580 /// numbers found in lowering. This function ignores switches that are
581 /// lowered with a mix of jump table / bit test / BTree. This function was
582 /// initially intended to be used when estimating the cost of switch in
583 /// inline cost heuristic, but it's a generic cost model to be used in other
584 /// places (e.g., in loop unrolling).
585 unsigned N = SI.getNumCases();
586 const TargetLoweringBase *TLI = getTLI();
587 const DataLayout &DL = this->getDataLayout();
588
589 JumpTableSize = 0;
590 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
591
592 // Early exit if both a jump table and bit test are not allowed.
593 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
594 return N;
595
596 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
597 APInt MinCaseVal = MaxCaseVal;
598 for (auto CI : SI.cases()) {
599 const APInt &CaseVal = CI.getCaseValue()->getValue();
600 if (CaseVal.sgt(MaxCaseVal))
601 MaxCaseVal = CaseVal;
602 if (CaseVal.slt(MinCaseVal))
603 MinCaseVal = CaseVal;
604 }
605
606 // Check if suitable for a bit test
607 if (N <= DL.getIndexSizeInBits(0u)) {
609 for (auto I : SI.cases()) {
610 const BasicBlock *BB = I.getCaseSuccessor();
611 ++DestMap[BB];
612 }
613
614 if (TLI->isSuitableForBitTests(DestMap, MinCaseVal, MaxCaseVal, DL))
615 return 1;
616 }
617
618 // Check if suitable for a jump table.
619 if (IsJTAllowed) {
620 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
621 return N;
623 (MaxCaseVal - MinCaseVal)
624 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
625 // Check whether a range of clusters is dense enough for a jump table
626 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
627 JumpTableSize = Range;
628 return 1;
629 }
630 }
631 return N;
632 }
633
634 bool shouldBuildLookupTables() const override {
635 const TargetLoweringBase *TLI = getTLI();
636 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
637 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
638 }
639
640 bool shouldBuildRelLookupTables() const override {
641 const TargetMachine &TM = getTLI()->getTargetMachine();
642 // If non-PIC mode, do not generate a relative lookup table.
643 if (!TM.isPositionIndependent())
644 return false;
645
646 /// Relative lookup table entries consist of 32-bit offsets.
647 /// Do not generate relative lookup tables for large code models
648 /// in 64-bit architectures where 32-bit offsets might not be enough.
649 if (TM.getCodeModel() == CodeModel::Medium ||
651 return false;
652
653 const Triple &TargetTriple = TM.getTargetTriple();
654 if (!TargetTriple.isArch64Bit())
655 return false;
656
657 // Disable relative lookup tables for all AArch64 targets. Even AArch64's
658 // small code model allows a 4GB span of text + data, which might not fit
659 // in the 32-bit offsets relative lookup tables generate.
660 if (TargetTriple.isAArch64())
661 return false;
662
663 return true;
664 }
665
666 bool haveFastSqrt(Type *Ty) const override {
667 const TargetLoweringBase *TLI = getTLI();
668 EVT VT = TLI->getValueType(DL, Ty);
669 return TLI->isTypeLegal(VT) &&
671 }
672
673 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override { return true; }
674
675 InstructionCost getFPOpCost(Type *Ty) const override {
676 // Check whether FADD is available, as a proxy for floating-point in
677 // general.
678 const TargetLoweringBase *TLI = getTLI();
679 EVT VT = TLI->getValueType(DL, Ty);
683 }
684
686 const Function &Fn) const override {
687 switch (Inst.getOpcode()) {
688 default:
689 break;
690 case Instruction::SDiv:
691 case Instruction::SRem:
692 case Instruction::UDiv:
693 case Instruction::URem: {
694 if (!isa<ConstantInt>(Inst.getOperand(1)))
695 return false;
696 EVT VT = getTLI()->getValueType(DL, Inst.getType());
697 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
698 }
699 };
700
701 return false;
702 }
703
704 unsigned getInliningThresholdMultiplier() const override { return 1; }
705 unsigned adjustInliningThreshold(const CallBase *CB) const override {
706 return 0;
707 }
708 unsigned getCallerAllocaCost(const CallBase *CB,
709 const AllocaInst *AI) const override {
710 return 0;
711 }
712
713 int getInlinerVectorBonusPercent() const override { return 150; }
714
717 OptimizationRemarkEmitter *ORE) const override {
718 // This unrolling functionality is target independent, but to provide some
719 // motivation for its intended use, for x86:
720
721 // According to the Intel 64 and IA-32 Architectures Optimization Reference
722 // Manual, Intel Core models and later have a loop stream detector (and
723 // associated uop queue) that can benefit from partial unrolling.
724 // The relevant requirements are:
725 // - The loop must have no more than 4 (8 for Nehalem and later) branches
726 // taken, and none of them may be calls.
727 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
728
729 // According to the Software Optimization Guide for AMD Family 15h
730 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
731 // and loop buffer which can benefit from partial unrolling.
732 // The relevant requirements are:
733 // - The loop must have fewer than 16 branches
734 // - The loop must have less than 40 uops in all executed loop branches
735
736 // The number of taken branches in a loop is hard to estimate here, and
737 // benchmarking has revealed that it is better not to be conservative when
738 // estimating the branch count. As a result, we'll ignore the branch limits
739 // until someone finds a case where it matters in practice.
740
741 unsigned MaxOps;
742 const TargetSubtargetInfo *ST = getST();
743 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
745 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
746 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
747 else
748 return;
749
750 // Scan the loop: don't unroll loops with calls.
751 for (BasicBlock *BB : L->blocks()) {
752 for (Instruction &I : *BB) {
753 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
754 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
755 if (!thisT()->isLoweredToCall(F))
756 continue;
757 }
758
759 if (ORE) {
760 ORE->emit([&]() {
761 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
762 L->getHeader())
763 << "advising against unrolling the loop because it "
764 "contains a "
765 << ore::NV("Call", &I);
766 });
767 }
768 return;
769 }
770 }
771 }
772
773 // Enable runtime and partial unrolling up to the specified size.
774 // Enable using trip count upper bound to unroll loops.
775 UP.Partial = UP.Runtime = UP.UpperBound = true;
776 UP.PartialThreshold = MaxOps;
777
778 // Avoid unrolling when optimizing for size.
779 UP.OptSizeThreshold = 0;
781
782 // Set number of instructions optimized when "back edge"
783 // becomes "fall through" to default value of 2.
784 UP.BEInsns = 2;
785 }
786
788 TTI::PeelingPreferences &PP) const override {
789 PP.PeelCount = 0;
790 PP.AllowPeeling = true;
791 PP.AllowLoopNestsPeeling = false;
792 PP.PeelProfiledIterations = true;
793 }
794
797 HardwareLoopInfo &HWLoopInfo) const override {
798 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
799 }
800
801 unsigned getEpilogueVectorizationMinVF() const override {
803 }
804
808
812
813 std::optional<Instruction *>
816 }
817
818 std::optional<Value *>
820 APInt DemandedMask, KnownBits &Known,
821 bool &KnownBitsComputed) const override {
822 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
823 KnownBitsComputed);
824 }
825
827 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
828 APInt &UndefElts2, APInt &UndefElts3,
829 std::function<void(Instruction *, unsigned, APInt, APInt &)>
830 SimplifyAndSetOp) const override {
832 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
833 SimplifyAndSetOp);
834 }
835
836 std::optional<unsigned>
838 return std::optional<unsigned>(
839 getST()->getCacheSize(static_cast<unsigned>(Level)));
840 }
841
842 std::optional<unsigned>
844 std::optional<unsigned> TargetResult =
845 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
846
847 if (TargetResult)
848 return TargetResult;
849
850 return BaseT::getCacheAssociativity(Level);
851 }
852
853 unsigned getCacheLineSize() const override {
854 return getST()->getCacheLineSize();
855 }
856
857 unsigned getPrefetchDistance() const override {
858 return getST()->getPrefetchDistance();
859 }
860
861 unsigned getMinPrefetchStride(unsigned NumMemAccesses,
862 unsigned NumStridedMemAccesses,
863 unsigned NumPrefetches,
864 bool HasCall) const override {
865 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
866 NumPrefetches, HasCall);
867 }
868
869 unsigned getMaxPrefetchIterationsAhead() const override {
870 return getST()->getMaxPrefetchIterationsAhead();
871 }
872
873 bool enableWritePrefetching() const override {
874 return getST()->enableWritePrefetching();
875 }
876
877 bool shouldPrefetchAddressSpace(unsigned AS) const override {
878 return getST()->shouldPrefetchAddressSpace(AS);
879 }
880
881 /// @}
882
883 /// \name Vector TTI Implementations
884 /// @{
885
890
891 std::optional<unsigned> getMaxVScale() const override { return std::nullopt; }
892 std::optional<unsigned> getVScaleForTuning() const override {
893 return std::nullopt;
894 }
895
896 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
897 /// are set if the demanded result elements need to be inserted and/or
898 /// extracted from vectors.
900 getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts,
901 bool Insert, bool Extract,
903 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {},
905 TTI::VectorInstrContext::None) const override {
906 /// FIXME: a bitfield is not a reasonable abstraction for talking about
907 /// which elements are needed from a scalable vector
908 if (isa<ScalableVectorType>(InTy))
910 auto *Ty = cast<FixedVectorType>(InTy);
911
912 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
913 (VL.empty() || VL.size() == Ty->getNumElements()) &&
914 "Vector size mismatch");
915
917
918 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
919 if (!DemandedElts[i])
920 continue;
921 if (Insert) {
922 Value *InsertedVal = VL.empty() ? nullptr : VL[i];
923 Cost +=
924 thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
925 CostKind, i, nullptr, InsertedVal, VIC);
926 }
927 if (Extract)
928 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
929 CostKind, i, nullptr, nullptr, VIC);
930 }
931
932 return Cost;
933 }
934
935 bool
937 unsigned ScalarOpdIdx) const override {
938 return false;
939 }
940
942 int OpdIdx) const override {
943 return OpdIdx == -1;
944 }
945
946 bool
948 int RetIdx) const override {
949 return RetIdx == 0;
950 }
951
952 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
954 VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind,
955 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {},
957 if (isa<ScalableVectorType>(InTy))
959 auto *Ty = cast<FixedVectorType>(InTy);
960
961 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
962 // Use CRTP to allow target overrides
963 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
964 CostKind, ForPoisonSrc, VL, VIC);
965 }
966
967 /// Estimate the overhead of scalarizing an instruction's
968 /// operands. The (potentially vector) types to use for each
969 /// argument are passed via Tys.
973 TTI::VectorInstrContext::None) const override {
975 for (Type *Ty : Tys) {
976 // Disregard things like metadata arguments.
977 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
978 !Ty->isPtrOrPtrVectorTy())
979 continue;
980
981 if (auto *VecTy = dyn_cast<VectorType>(Ty))
982 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
983 /*Extract*/ true, CostKind,
984 /*ForPoisonSrc=*/true, {}, VIC);
985 }
986
987 return Cost;
988 }
989
990 /// Estimate the overhead of scalarizing the inputs and outputs of an
991 /// instruction, with return type RetTy and arguments Args of type Tys. If
992 /// Args are unknown (empty), then the cost associated with one argument is
993 /// added as a heuristic.
999 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1000 if (!Args.empty())
1002 filterConstantAndDuplicatedOperands(Args, Tys), CostKind);
1003 else
1004 // When no information on arguments is provided, we add the cost
1005 // associated with one argument as a heuristic.
1006 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
1007 /*Extract*/ true, CostKind);
1008
1009 return Cost;
1010 }
1011
1012 /// Estimate the cost of type-legalization and the legalized type.
///
/// Starts from the value type of \p Ty and repeatedly asks the target lowering
/// for the next type conversion until a legal type is reached or the
/// conversion stops making progress.
///
/// \returns a pair of (accumulated cost, simple value type). The cost is
/// doubled for each step taken through the vector-split branch below; an
/// invalid cost is returned from the early-exit branch inside the loop.
1013 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
1014 LLVMContext &C = Ty->getContext();
1015 EVT MTy = getTLI()->getValueType(DL, Ty);
1016
// NOTE(review): the declaration/initialization of `Cost` is not visible in
// this listing.
1018 // We keep legalizing the type until we find a legal kind. We assume that
1019 // the only operation that costs anything is the split. After splitting
1020 // we need to handle two types.
1021 while (true) {
1022 TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);
1023
// NOTE(review): the condition guarding this early return is not visible in
// this listing.
1025 // Ensure we return a sensible simple VT here, since many callers of
1026 // this function require it. MVT::i64 is the fallback when MTy is not simple.
1027 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
1028 return std::make_pair(InstructionCost::getInvalid(), VT);
1029 }
1030
// Reached a legal type: report the cost accumulated so far.
1031 if (LK.first == TargetLoweringBase::TypeLegal)
1032 return std::make_pair(Cost, MTy.getSimpleVT());
1033
// A split step doubles the cost. NOTE(review): the second half of this
// condition is not visible in this listing.
1034 if (LK.first == TargetLoweringBase::TypeSplitVector ||
1036 Cost *= 2;
1037
1038 // Do not loop with f128 type: stop when the conversion yields the same type.
1039 if (MTy == LK.second)
1040 return std::make_pair(Cost, MTy.getSimpleVT());
1041
1042 // Keep legalizing the type.
1043 MTy = LK.second;
1044 }
1045 }
1046
/// Returns 1 for every \p VF; the argument is not inspected.
1047 unsigned getMaxInterleaveFactor(ElementCount VF) const override { return 1; }
1048
// Estimates the cost of the arithmetic instruction \p Opcode on type \p Ty
// from what the target lowering reports for the legalized type:
//  - legal or promoted operation: one per-operation unit per legalized part;
//  - not expanded (custom lowered): twice that;
//  - expanded urem/srem with a usable div/divrem: div + mul + sub;
//  - otherwise, on a fixed vector: per-element scalar cost plus the
//    scalarization overhead;
//  - otherwise: a single per-operation unit.
// The per-operation unit is 2 for floating-point types and 1 otherwise.
// NOTE(review): the first lines of the declaration (return type, name, and the
// Opd1Info/Opd2Info parameters) are not visible in this listing.
1050 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1053 ArrayRef<const Value *> Args = {},
1054 const Instruction *CxtI = nullptr) const override {
1055 // Translate the IR opcode into the corresponding ISD opcode.
1056 const TargetLoweringBase *TLI = getTLI();
1057 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1058 assert(ISD && "Invalid opcode");
1059
1060 // TODO: Handle more cost kinds.
// NOTE(review): the condition selecting this fallback to the base
// implementation is not visible in this listing.
1062 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
1063 Opd1Info, Opd2Info,
1064 Args, CxtI);
1065
1066 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1067
1068 bool IsFloat = Ty->isFPOrFPVectorTy();
1069 // Assume that floating point arithmetic operations cost twice as much as
1070 // integer operations.
1071 InstructionCost OpCost = (IsFloat ? 2 : 1);
1072
1073 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1074 // The operation is legal. Assume one OpCost unit per legalized part.
1075 // TODO: Once we have extract/insert subvector cost we need to use them.
1076 return LT.first * OpCost;
1077 }
1078
1079 if (!TLI->isOperationExpand(ISD, LT.second)) {
1080 // If the operation is custom lowered, then assume that the code is twice
1081 // as expensive.
1082 return LT.first * 2 * OpCost;
1083 }
1084
1085 // An 'Expand' of URem and SRem is special because it may default
1086 // to expanding the operation into a sequence of sub-operations
1087 // i.e. X % Y -> X-(X/Y)*Y.
1088 if (ISD == ISD::UREM || ISD == ISD::SREM) {
1089 bool IsSigned = ISD == ISD::SREM;
1090 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
1091 LT.second) ||
1092 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
1093 LT.second)) {
1094 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
// Only the division is costed with the caller's operand info; the mul and
// sub use the default operand info.
1095 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
1096 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
1097 InstructionCost MulCost =
1098 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
1099 InstructionCost SubCost =
1100 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
1101 return DivCost + MulCost + SubCost;
1102 }
1103 }
1104
1105 // We cannot scalarize scalable vectors, so return Invalid.
// NOTE(review): the statement implementing this check is not visible in this
// listing.
1108
1109 // Else, assume that we need to scalarize this op.
1110 // TODO: If one of the types gets legalized by splitting, handle this
1111 // similarly to what getCastInstrCost() does.
1112 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1113 InstructionCost Cost = thisT()->getArithmeticInstrCost(
1114 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
1115 Args, CxtI);
1116 // Return the cost of multiple scalar invocation plus the cost of
1117 // inserting and extracting the values. Every argument is assumed to have
// the vector type Ty.
1118 SmallVector<Type *> Tys(Args.size(), Ty);
1119 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1120 VTy->getNumElements() * Cost;
1121 }
1122
1123 // We don't know anything about this scalar instruction.
1124 return OpCost;
1125 }
1126
// Tries to refine the generic shuffle kind \p Kind into a more specific kind
// by inspecting \p Mask. Returns \p Kind unchanged when the mask is empty or
// matches none of the recognized patterns. For the subvector patterns, \p Index
// and \p SubTy are written as outputs.
// NOTE(review): the first line of the declaration, one case label and the
// return statements following the SubTy assignments are not visible in this
// listing.
1128 ArrayRef<int> Mask,
1129 VectorType *SrcTy, int &Index,
1130 VectorType *&SubTy) const {
1131 if (Mask.empty())
1132 return Kind;
1133 int NumDstElts = Mask.size();
1134 int NumSrcElts = SrcTy->getElementCount().getKnownMinValue();
1135 switch (Kind) {
1137 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
1138 return TTI::SK_Reverse;
1139 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
1140 return TTI::SK_Broadcast;
1141 if (isSplatMask(Mask, NumSrcElts, Index))
1142 return TTI::SK_Broadcast;
// An extract-subvector mask is only accepted when the extracted range lies
// entirely within the source vector.
1143 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
1144 (Index + NumDstElts) <= NumSrcElts) {
1145 SubTy = FixedVectorType::get(SrcTy->getElementType(), NumDstElts);
1147 }
1148 break;
1149 }
1150 case TTI::SK_PermuteTwoSrc: {
// Every mask element is below NumSrcElts, i.e. only the first source is
// referenced. NOTE(review): the start of the statement executed in that case
// is not visible in this listing.
1151 if (all_of(Mask, [NumSrcElts](int M) { return M < NumSrcElts; }))
1153 Index, SubTy);
1154 int NumSubElts;
1155 if (NumDstElts > 2 && ShuffleVectorInst::isInsertSubvectorMask(
1156 Mask, NumSrcElts, NumSubElts, Index)) {
// Keep the original kind if the inserted range would not fit in the source.
1157 if (Index + NumSubElts > NumSrcElts)
1158 return Kind;
1159 SubTy = FixedVectorType::get(SrcTy->getElementType(), NumSubElts);
1161 }
1162 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
1163 return TTI::SK_Select;
1164 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1165 return TTI::SK_Transpose;
1166 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1167 return TTI::SK_Splice;
1168 break;
1169 }
// The remaining kinds are returned unchanged.
1170 case TTI::SK_Select:
1171 case TTI::SK_Reverse:
1172 case TTI::SK_Broadcast:
1173 case TTI::SK_Transpose:
1176 case TTI::SK_Splice:
1177 break;
1178 }
1179 return Kind;
1180 }
1181
// Shuffle cost entry point: refines the shuffle kind with
// improveShuffleKindFromMask() and dispatches to the matching overhead helper
// (broadcast, permute, extract-subvector or insert-subvector).
// NOTE(review): the start of the declaration, several case labels and some
// fall-back statements are not visible in this listing.
1185 VectorType *SubTp, ArrayRef<const Value *> Args = {},
1186 const Instruction *CxtI = nullptr) const override {
1187 switch (improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp)) {
1188 case TTI::SK_Broadcast:
// The broadcast helper is only used for fixed-width source vectors.
1189 if (auto *FVT = dyn_cast<FixedVectorType>(SrcTy))
1190 return getBroadcastShuffleOverhead(FVT, CostKind);
1192 case TTI::SK_Select:
1193 case TTI::SK_Splice:
1194 case TTI::SK_Reverse:
1195 case TTI::SK_Transpose:
// The permute helper is only used for fixed-width source vectors.
1198 if (auto *FVT = dyn_cast<FixedVectorType>(SrcTy))
1199 return getPermuteShuffleOverhead(FVT, CostKind);
// The subvector helpers require SubTp to be a fixed-width vector (cast<>
// asserts otherwise).
1202 return getExtractSubvectorOverhead(SrcTy, CostKind, Index,
1203 cast<FixedVectorType>(SubTp));
1205 return getInsertSubvectorOverhead(DstTy, CostKind, Index,
1206 cast<FixedVectorType>(SubTp));
1207 }
1208 llvm_unreachable("Unknown TTI::ShuffleKind");
1209 }
1210
// Estimates the cost of the cast \p Opcode from \p Src to \p Dst.
//
// Order of evaluation:
//  1. casts the base implementation already considers free;
//  2. per-opcode free cases (free truncates/extends, same-register bitcasts,
//     extending loads, free address-space casts);
//  3. casts that are legal/promoted for the legalized destination type;
//  4. scalar casts (1 if not expanded, 4 otherwise);
//  5. vector-to-vector casts (same-size, split, or scalarized);
//  6. bitcasts between a vector and a scalar.
// NOTE(review): the return type and the CCH/CostKind parameters of the
// declaration are not visible in this listing.
1212 getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1214 const Instruction *I = nullptr) const override {
1215 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1216 return 0;
1217
1218 const TargetLoweringBase *TLI = getTLI();
1219 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1220 assert(ISD && "Invalid opcode");
1221 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1222 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1223
1224 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1225 TypeSize DstSize = DstLT.second.getSizeInBits();
1226 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1227 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1228
1229 switch (Opcode) {
1230 default:
1231 break;
1232 case Instruction::Trunc:
1233 // Check for NOOP conversions.
1234 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1235 return 0;
1236 [[fallthrough]];
1237 case Instruction::BitCast:
1238 // Bitcast between types that are legalized to the same type are free and
1239 // assume int to/from ptr of the same size is also free.
1240 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1241 SrcSize == DstSize)
1242 return 0;
1243 break;
1244 case Instruction::FPExt:
1245 if (I && getTLI()->isExtFree(I))
1246 return 0;
1247 break;
1248 case Instruction::ZExt:
1249 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1250 return 0;
1251 [[fallthrough]];
1252 case Instruction::SExt:
1253 if (I && getTLI()->isExtFree(I))
1254 return 0;
1255
1256 // If this is a zext/sext of a load, return 0 if the corresponding
1257 // extending load exists on target and the result type is legal.
1258 if (CCH == TTI::CastContextHint::Normal) {
1259 EVT ExtVT = EVT::getEVT(Dst);
1260 EVT LoadVT = EVT::getEVT(Src);
1261 unsigned LType =
1262 Opcode == Instruction::ZExt ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
1263 if (I) {
// The extended value comes either from a plain load or from a masked load
// intrinsic; other producers are not treated as free.
1264 if (auto *LI = dyn_cast<LoadInst>(I->getOperand(0))) {
1265 if (DstLT.first == SrcLT.first &&
1266 TLI->isLoadLegal(ExtVT, LoadVT, LI->getAlign(),
1267 LI->getPointerAddressSpace(), LType, false))
1268 return 0;
1269 } else if (auto *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) {
1270 switch (II->getIntrinsicID()) {
1271 case Intrinsic::masked_load: {
1272 Type *PtrType = II->getArgOperand(0)->getType();
1273 assert(PtrType->isPointerTy());
1274
1275 if (DstLT.first == SrcLT.first &&
1276 TLI->isLoadLegal(
1277 ExtVT, LoadVT, II->getParamAlign(0).valueOrOne(),
1278 PtrType->getPointerAddressSpace(), LType, false))
1279 return 0;
1280
1281 break;
1282 }
1283 default:
1284 break;
1285 }
1286 }
1287 }
1288 }
1289 break;
1290 case Instruction::AddrSpaceCast:
1291 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1292 Dst->getPointerAddressSpace()))
1293 return 0;
1294 break;
1295 }
1296
1297 auto *SrcVTy = dyn_cast<VectorType>(Src);
1298 auto *DstVTy = dyn_cast<VectorType>(Dst);
1299
1300 // If the cast is marked as legal (or promote) then assume low cost.
1301 if (SrcLT.first == DstLT.first &&
1302 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1303 return SrcLT.first;
1304
1305 // Handle scalar conversions.
1306 if (!SrcVTy && !DstVTy) {
1307 // Just check the op cost. If the operation is legal then assume it costs
1308 // 1.
1309 if (!TLI->isOperationExpand(ISD, DstLT.second))
1310 return 1;
1311
1312 // Assume that illegal scalar instructions are expensive.
1313 return 4;
1314 }
1315
1316 // Check vector-to-vector casts.
1317 if (DstVTy && SrcVTy) {
1318 // If the cast is between same-sized registers, then the check is simple.
1319 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1320
1321 // Assume that Zext is done using AND.
1322 if (Opcode == Instruction::ZExt)
1323 return SrcLT.first;
1324
1325 // Assume that sext is done using SHL and SRA.
1326 if (Opcode == Instruction::SExt)
1327 return SrcLT.first * 2;
1328
1329 // Just check the op cost. If the operation is legal then assume it
1330 // costs 1 and multiply by the type-legalization overhead
1331 // (SrcLT.first).
1332 if (!TLI->isOperationExpand(ISD, DstLT.second))
1333 return SrcLT.first * 1;
1334 }
1335
1336 // If we are legalizing by splitting, query the concrete TTI for the cost
1337 // of casting the original vector twice. We also need to factor in the
1338 // cost of the split itself. Count that as 1, to be consistent with
1339 // getTypeLegalizationCost().
// NOTE(review): the right-hand sides of the two comparisons below are not
// visible in this listing.
1340 bool SplitSrc =
1341 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1343 bool SplitDst =
1344 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
// Halving is only attempted when both element counts are known to be even.
1346 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isKnownEven() &&
1347 DstVTy->getElementCount().isKnownEven()) {
1348 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1349 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1350 const T *TTI = thisT();
1351 // If both types need to be split then the split is free.
1352 InstructionCost SplitCost =
1353 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1354 return SplitCost +
1355 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1356 CostKind, I));
1357 }
1358
1359 // Scalarization cost is Invalid, can't assume any num elements.
// NOTE(review): the statement executed for scalable destination vectors is
// not visible in this listing.
1360 if (isa<ScalableVectorType>(DstVTy))
1362
1363 // In other cases where the source or destination are illegal, assume
1364 // the operation will get scalarized.
1365 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1366 InstructionCost Cost = thisT()->getCastInstrCost(
1367 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1368
1369 // Return the cost of multiple scalar invocation plus the cost of
1370 // inserting and extracting the values.
1371 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1372 CostKind) +
1373 Num * Cost;
1374 }
1375
1376 // We already handled vector-to-vector and scalar-to-scalar conversions.
1377 // This is where we handle bitcasts between vectors and scalars. We need
1378 // to assume that the conversion is scalarized in one way or another.
1379 // Any other opcode reaching this point is treated as unreachable.
1380 if (Opcode == Instruction::BitCast) {
1381 // Illegal bitcasts are done by storing and loading from a stack slot.
// Modeled as extracting from a vector source plus inserting into a vector
// destination; a scalar side contributes 0.
1382 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1383 /*Extract*/ true, CostKind)
1384 : 0) +
1385 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1386 /*Extract*/ false, CostKind)
1387 : 0);
1388 }
1389
1390 llvm_unreachable("Unhandled cast");
1391 }
1392
// Cost of extracting element \p Index from \p VecTy and then applying the cast
// \p Opcode to \p Dst: the sum of an ExtractElement cost and a cast cost from
// the vector's element type, both queried on the concrete TTI (thisT()).
// NOTE(review): the return type and the trailing arguments of the cast query
// are not visible in this listing.
1394 getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
1395 unsigned Index,
1396 TTI::TargetCostKind CostKind) const override {
1397 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1398 CostKind, Index, nullptr, nullptr) +
1399 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1401 }
1402
// Control-flow instruction cost: forwards unchanged to the base-class
// implementation.
// NOTE(review): the start of the declaration is not visible in this listing.
1405 const Instruction *I = nullptr) const override {
1406 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1407 }
1408
// Estimates the cost of a compare or select instruction on \p ValTy.
//
// Types with no value type (MVT::Other) are deferred to the base
// implementation. Otherwise the operation costs one unit per legalized part
// when it is not expanded on the legalized type; vector operations that do not
// meet that condition are priced as scalarized, and anything else costs 1.
// NOTE(review): the first line of the declaration and the
// CostKind/Op1Info/Op2Info parameters are not visible in this listing.
1410 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1414 const Instruction *I = nullptr) const override {
1415 const TargetLoweringBase *TLI = getTLI();
1416 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1417 assert(ISD && "Invalid opcode");
1418
1419 if (getTLI()->getValueType(DL, ValTy, true) == MVT::Other)
1420 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1421 Op1Info, Op2Info, I);
1422
1423 // Selects on vectors are actually vector selects.
1424 if (ISD == ISD::SELECT) {
1425 assert(CondTy && "CondTy must exist");
1426 if (CondTy->isVectorTy())
1427 ISD = ISD::VSELECT;
1428 }
1429 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1430
// The cheap path is skipped when a vector type legalizes to a non-vector
// type, so that case falls through to scalarization below.
1431 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1432 !TLI->isOperationExpand(ISD, LT.second)) {
1433 // The operation is legal. Assume it costs 1. Multiply
1434 // by the type-legalization overhead.
1435 return LT.first * 1;
1436 }
1437
1438 // Otherwise, assume that the operation is scalarized.
1439 // TODO: If one of the types gets legalized by splitting, handle this
1440 // similarly to what getCastInstrCost() does.
1441 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
// NOTE(review): the statement executed for scalable vectors is not visible
// in this listing.
1442 if (isa<ScalableVectorType>(ValTy))
1444
1445 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
// NOTE(review): CondTy is dereferenced here without a null check; it is only
// asserted non-null on the select path above — confirm all callers pass it.
1446 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1447 Opcode, ValVTy->getScalarType(), CondTy->getScalarType(), VecPred,
1448 CostKind, Op1Info, Op2Info, I);
1449
1450 // Return the cost of multiple scalar invocation plus the cost of
1451 // inserting the results (insert-only overhead).
1452 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1453 /*Extract*/ false, CostKind) +
1454 Num * Cost;
1455 }
1456
1457 // Unknown scalar opcode.
1458 return 1;
1459 }
1460
// Insert/extract element cost: returns getRegUsageForType() of the scalar
// type of Val; Index, Op0 and Op1 are not used by this implementation.
// NOTE(review): the start of the declaration is not visible in this listing.
1463 unsigned Index, const Value *Op0, const Value *Op1,
1465 TTI::VectorInstrContext::None) const override {
1466 return getRegUsageForType(Val->getScalarType());
1467 }
1468
/// Overload taking the extracted scalar and its users. This implementation
/// does not use \p Scalar or \p ScalarUserAndIdx; it forwards to the
/// operand-based overload with null operands.
///
1469 /// \param ScalarUserAndIdx encodes the information about extracts from a
1470 /// vector with 'Scalar' being the value being extracted, 'User' being the user
1471 /// of the extract (nullptr if user is not known before vectorization) and
1472 /// 'Idx' being the extract lane.
// NOTE(review): the first line of the declaration is not visible in this
// listing.
1474 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1475 Value *Scalar,
1476 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
1478 TTI::VectorInstrContext::None) const override {
1479 return getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr, nullptr,
1480 VIC);
1481 }
1482
// Overload taking the instruction itself: for an insertelement, its first two
// operands are passed on as Op0/Op1 (otherwise both stay null), then the query
// is forwarded to the concrete TTI's operand-based overload using the
// instruction's opcode.
// NOTE(review): the start of the declaration is not visible in this listing.
1485 TTI::TargetCostKind CostKind, unsigned Index,
1487 TTI::VectorInstrContext::None) const override {
1488 Value *Op0 = nullptr;
1489 Value *Op1 = nullptr;
1490 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1491 Op0 = IE->getOperand(0);
1492 Op1 = IE->getOperand(1);
1493 }
1494 // If VIC is None, compute it from the instruction
// NOTE(review): the statement implementing the comment above is not visible
// in this listing.
1497 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1498 Op1, VIC);
1499 }
1500
// Vector insert/extract cost where \p Index counts from the end of the vector.
// For fixed-width vectors the index is converted to a front-based index
// (NumElements - 1 - Index) before the regular query is made.
// NOTE(review): the start of the declaration is not visible in this listing.
1504 unsigned Index) const override {
// -1 wraps to the maximum unsigned value; it is passed through unchanged when
// Val is not a fixed-width vector.
1505 unsigned NewIndex = -1;
1506 if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
1507 assert(Index < FVTy->getNumElements() &&
1508 "Unexpected index from end of vector");
1509 NewIndex = FVTy->getNumElements() - 1 - Index;
1510 }
1511 return thisT()->getVectorInstrCost(Opcode, Val, CostKind, NewIndex, nullptr,
1512 nullptr);
1513 }
1514
// Cost of a replication shuffle: each of the \p VF source elements of type
// \p EltTy is repeated \p ReplicationFactor times in the result.
// \p DemandedDstElts has one bit per result element (asserted below). The
// cost is the extract overhead for the demanded source elements plus the
// insert overhead for the demanded result elements.
// NOTE(review): the return type and the declaration of `Cost` are not visible
// in this listing.
1516 getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
1517 const APInt &DemandedDstElts,
1518 TTI::TargetCostKind CostKind) const override {
1519 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1520 "Unexpected size of DemandedDstElts.");
1521
1523
1524 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1525 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1526
1527 // The mask shuffling cost is modeled as extracting all the elements of the
1528 // mask and inserting each of them Factor times into the wide vector:
1529 //
1530 // E.g. an interleaved group with factor 3:
1531 // %mask = icmp ult <8 x i32> %vec1, %vec2
1532 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1533 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1534 // The cost is estimated as extract all mask elements from the <8xi1> mask
1535 // vector and insert them factor times into the <24xi1> shuffled mask
1536 // vector.
// Collapse the per-result-element demand mask to VF bits, one per source
// element.
1537 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1538 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1539 /*Insert*/ false,
1540 /*Extract*/ true, CostKind);
1541 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1542 /*Insert*/ true,
1543 /*Extract*/ false, CostKind);
1544
1545 return Cost;
1546 }
1547
// Estimates the cost of a load or store of type \p Src.
//
// Types without a value type (MVT::Other) cost 4, as do loads queried for
// latency. Otherwise the base cost is the type-legalization cost; vector
// accesses whose legalized type is larger than the stored type additionally
// pay scalarization overhead unless the matching extending load / truncating
// store is legal or custom.
// NOTE(review): the first line of the declaration and the CostKind/OpInfo
// parameters are not visible in this listing.
1549 unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
1552 const Instruction *I = nullptr) const override {
1553 assert(!Src->isVoidTy() && "Invalid type");
1554 // Assume types, such as structs, are expensive.
1555 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1556 return 4;
1557 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1558
1559 // FIXME: Arbitrary cost
1560 if (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency)
1561 return 4;
1562
1563 // Assuming that all loads of legal types cost 1.
1564 InstructionCost Cost = LT.first;
// NOTE(review): the condition guarding this early return is not visible in
// this listing.
1566 return Cost;
1567
1568 const DataLayout &DL = this->getDataLayout();
1569 if (Src->isVectorTy() &&
1570 // In practice it's not currently possible to have a change in lane
1571 // length for extending loads or truncating stores so both types should
1572 // have the same scalable property.
1573 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1574 LT.second.getSizeInBits())) {
1575 // This is a vector load that legalizes to a larger type than the vector
1576 // itself. Unless the corresponding extending load or truncating store is
1577 // legal, then this will scalarize.
// NOTE(review): the declaration of `LA` is not visible in this listing.
1579 EVT MemVT = getTLI()->getValueType(DL, Src);
1580 if (Opcode == Instruction::Store)
1581 LA = getTLI()->getTruncStoreAction(LT.second, MemVT, Alignment,
1582 AddressSpace);
1583 else
1584 LA = getTLI()->getLoadAction(LT.second, MemVT, Alignment, AddressSpace,
1585 ISD::EXTLOAD, false);
1586
1587 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1588 // This is a vector load/store for some illegal type that is scalarized.
1589 // We must account for the cost of building or decomposing the vector:
// insert overhead for anything but a store, extract overhead for a store.
1591 cast<VectorType>(Src), Opcode != Instruction::Store,
1592 Opcode == Instruction::Store, CostKind);
1593 }
1594 }
1595
1596 return Cost;
1597 }
1598
// Estimates the cost of an interleaved load/store group of \p Factor members
// over the wide vector \p VecTy, of which the members listed in \p Indices are
// actually used.
//
// The estimate is the sum of:
//  1. the wide memory operation (masked if either mask flag is set), scaled
//     down to the fraction of legalized instructions that are actually used;
//  2. the (de)interleaving, modeled as element extracts plus inserts;
//  3. when \p UseMaskForCond is set, replicating the condition mask, plus an
//     AND with the gaps mask when \p UseMaskForGaps is also set.
// NOTE(review): the first line of the declaration, the statement executed for
// scalable vectors and the declaration of `Cost` are not visible in this
// listing.
1600 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1601 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1602 bool UseMaskForCond = false, bool UseMaskForGaps = false) const override {
1603
1604 // We cannot scalarize scalable vectors, so return Invalid.
1605 if (isa<ScalableVectorType>(VecTy))
1607
1608 auto *VT = cast<FixedVectorType>(VecTy);
1609
1610 unsigned NumElts = VT->getNumElements();
1611 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1612
1613 unsigned NumSubElts = NumElts / Factor;
1614 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1615
1616 // First, the cost of the load/store operation.
1618 if (UseMaskForCond || UseMaskForGaps) {
1619 unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
1620 : Intrinsic::masked_store;
1621 Cost = thisT()->getMemIntrinsicInstrCost(
1622 MemIntrinsicCostAttributes(IID, VecTy, Alignment, AddressSpace),
1623 CostKind);
1624 } else
1625 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1626 CostKind);
1627
1628 // Legalize the vector type, and get the legalized and unlegalized type
1629 // sizes.
1630 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1631 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1632 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1633
1634 // Scale the cost of the memory operation by the fraction of legalized
1635 // instructions that will actually be used. We shouldn't account for the
1636 // cost of dead instructions since they will be removed.
1637 //
1638 // E.g., An interleaved load of factor 8:
1639 // %vec = load <16 x i64>, <16 x i64>* %ptr
1640 // %v0 = shufflevector %vec, undef, <0, 8>
1641 //
1642 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1643 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1644 // type). The other loads are unused.
1645 //
1646 // TODO: Note that legalization can turn masked loads/stores into unmasked
1647 // (legalized) loads/stores. This can be reflected in the cost.
1648 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1649 // The number of loads of a legal type it will take to represent a load
1650 // of the unlegalized vector type.
1651 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1652
1653 // The number of elements of the unlegalized type that correspond to a
1654 // single legal instruction.
1655 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1656
1657 // Determine which legal instructions will be used: element
// (Index + Elt * Factor) of the wide vector belongs to member Index.
1658 BitVector UsedInsts(NumLegalInsts, false);
1659 for (unsigned Index : Indices)
1660 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1661 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1662
1663 // Scale the cost of the load by the fraction of legal instructions that
1664 // will be used (rounded up).
1665 Cost = divideCeil(UsedInsts.count() * Cost.getValue(), NumLegalInsts);
1666 }
1667
1668 // Then add the cost of the interleave operation.
1669 assert(Indices.size() <= Factor &&
1670 "Interleaved memory op has too many members");
1671
1672 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1673 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1674
// Mark the wide-vector elements that belong to the used members.
1675 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1676 for (unsigned Index : Indices) {
1677 assert(Index < Factor && "Invalid index for interleaved memory op");
1678 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1679 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1680 }
1681
1682 if (Opcode == Instruction::Load) {
1683 // The interleave cost is similar to extract sub vectors' elements
1684 // from the wide vector, and insert them into sub vectors.
1685 //
1686 // E.g. An interleaved load of factor 2 (with one member of index 0):
1687 // %vec = load <8 x i32>, <8 x i32>* %ptr
1688 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1689 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1690 // <8 x i32> vector and insert them into a <4 x i32> vector.
1691 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1692 SubVT, DemandedAllSubElts,
1693 /*Insert*/ true, /*Extract*/ false, CostKind);
1694 Cost += Indices.size() * InsSubCost;
1695 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1696 /*Insert*/ false,
1697 /*Extract*/ true, CostKind);
1698 } else {
1699 // The interleave cost is extract elements from sub vectors, and
1700 // insert them into the wide vector.
1701 //
1702 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1703 // (using VF=4):
1704 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1705 // %gaps.mask = <true, true, false, true, true, false,
1706 // true, true, false, true, true, false>
1707 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1708 // i32 Align, <12 x i1> %gaps.mask
1709 // The cost is estimated as extract all elements (of actual members,
1710 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1711 // i32> vector.
1712 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1713 SubVT, DemandedAllSubElts,
1714 /*Insert*/ false, /*Extract*/ true, CostKind);
1715 Cost += ExtSubCost * Indices.size();
1716 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1717 /*Insert*/ true,
1718 /*Extract*/ false, CostKind);
1719 }
1720
// Without a condition mask there is no in-loop mask work left to account for.
1721 if (!UseMaskForCond)
1722 return Cost;
1723
// The mask is costed as a vector of i8 elements.
1724 Type *I8Type = Type::getInt8Ty(VT->getContext());
1725
1726 Cost += thisT()->getReplicationShuffleCost(
1727 I8Type, Factor, NumSubElts,
1728 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1729 CostKind);
1730
1731 // The Gaps mask is invariant and created outside the loop, therefore the
1732 // cost of creating it is not accounted for here. However if we have both
1733 // a MaskForGaps and some other mask that guards the execution of the
1734 // memory access, we need to account for the cost of And-ing the two masks
1735 // inside the loop.
1736 if (UseMaskForGaps) {
1737 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1738 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1739 CostKind);
1740 }
1741
1742 return Cost;
1743 }
1744
1745 /// Get intrinsic cost based on arguments.
1748 TTI::TargetCostKind CostKind) const override {
1749 // Check for generically free intrinsics.
1751 return 0;
1752
1753 // Assume that target intrinsics are cheap.
1754 Intrinsic::ID IID = ICA.getID();
1757
1758 // VP Intrinsics should have the same cost as their non-vp counterpart.
1759 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1760 // counterpart when the vector length argument is smaller than the maximum
1761 // vector length.
1762 // TODO: Support other kinds of VPIntrinsics
1763 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1764 std::optional<unsigned> FOp =
1766 if (FOp) {
1767 if (ICA.getID() == Intrinsic::vp_load) {
1768 Align Alignment;
1769 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1770 Alignment = VPI->getPointerAlignment().valueOrOne();
1771 unsigned AS = 0;
1772 if (ICA.getArgTypes().size() > 1)
1773 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[0]))
1774 AS = PtrTy->getAddressSpace();
1775 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1776 AS, CostKind);
1777 }
1778 if (ICA.getID() == Intrinsic::vp_store) {
1779 Align Alignment;
1780 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1781 Alignment = VPI->getPointerAlignment().valueOrOne();
1782 unsigned AS = 0;
1783 if (ICA.getArgTypes().size() >= 2)
1784 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[1]))
1785 AS = PtrTy->getAddressSpace();
1786 return thisT()->getMemoryOpCost(*FOp, ICA.getArgTypes()[0], Alignment,
1787 AS, CostKind);
1788 }
1790 ICA.getID() == Intrinsic::vp_fneg) {
1791 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1792 CostKind);
1793 }
1794 if (VPCastIntrinsic::isVPCast(ICA.getID())) {
1795 return thisT()->getCastInstrCost(
1796 *FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
1798 }
1799 if (VPCmpIntrinsic::isVPCmp(ICA.getID())) {
1800 // We can only handle vp_cmp intrinsics with underlying instructions.
1801 if (ICA.getInst()) {
1802 assert(FOp);
1803 auto *UI = cast<VPCmpIntrinsic>(ICA.getInst());
1804 return thisT()->getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0],
1805 ICA.getReturnType(),
1806 UI->getPredicate(), CostKind);
1807 }
1808 }
1809 }
1810 if (ICA.getID() == Intrinsic::vp_load_ff) {
1811 Type *RetTy = ICA.getReturnType();
1812 Type *DataTy = cast<StructType>(RetTy)->getElementType(0);
1813 Align Alignment;
1814 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1815 Alignment = VPI->getPointerAlignment().valueOrOne();
1816 return thisT()->getMemIntrinsicInstrCost(
1817 MemIntrinsicCostAttributes(ICA.getID(), DataTy, Alignment),
1818 CostKind);
1819 }
1820 if (ICA.getID() == Intrinsic::vp_scatter) {
1821 if (ICA.isTypeBasedOnly()) {
1822 IntrinsicCostAttributes MaskedScatter(
1825 ICA.getFlags());
1826 return getTypeBasedIntrinsicInstrCost(MaskedScatter, CostKind);
1827 }
1828 Align Alignment;
1829 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1830 Alignment = VPI->getPointerAlignment().valueOrOne();
1831 bool VarMask = isa<Constant>(ICA.getArgs()[2]);
1832 return thisT()->getMemIntrinsicInstrCost(
1833 MemIntrinsicCostAttributes(Intrinsic::vp_scatter,
1834 ICA.getArgTypes()[0], ICA.getArgs()[1],
1835 VarMask, Alignment, nullptr),
1836 CostKind);
1837 }
1838 if (ICA.getID() == Intrinsic::vp_gather) {
1839 if (ICA.isTypeBasedOnly()) {
1840 IntrinsicCostAttributes MaskedGather(
1843 ICA.getFlags());
1844 return getTypeBasedIntrinsicInstrCost(MaskedGather, CostKind);
1845 }
1846 Align Alignment;
1847 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1848 Alignment = VPI->getPointerAlignment().valueOrOne();
1849 bool VarMask = isa<Constant>(ICA.getArgs()[1]);
1850 return thisT()->getMemIntrinsicInstrCost(
1851 MemIntrinsicCostAttributes(Intrinsic::vp_gather,
1852 ICA.getReturnType(), ICA.getArgs()[0],
1853 VarMask, Alignment, nullptr),
1854 CostKind);
1855 }
1856
1857 if (ICA.getID() == Intrinsic::vp_select ||
1858 ICA.getID() == Intrinsic::vp_merge) {
1859 TTI::OperandValueInfo OpInfoX, OpInfoY;
1860 if (!ICA.isTypeBasedOnly()) {
1861 OpInfoX = TTI::getOperandInfo(ICA.getArgs()[0]);
1862 OpInfoY = TTI::getOperandInfo(ICA.getArgs()[1]);
1863 }
1864 return getCmpSelInstrCost(
1865 Instruction::Select, ICA.getReturnType(), ICA.getArgTypes()[0],
1866 CmpInst::BAD_ICMP_PREDICATE, CostKind, OpInfoX, OpInfoY);
1867 }
1868
1869 std::optional<Intrinsic::ID> FID =
1871
1872 // Not functionally equivalent but close enough for cost modelling.
1873 if (ICA.getID() == Intrinsic::experimental_vp_reverse)
1874 FID = Intrinsic::vector_reverse;
1875
1876 if (FID) {
1877 // Non-vp version will have same arg types except mask and vector
1878 // length.
1879 assert(ICA.getArgTypes().size() >= 2 &&
1880 "Expected VPIntrinsic to have Mask and Vector Length args and "
1881 "types");
1882
1883 ArrayRef<const Value *> NewArgs = ArrayRef(ICA.getArgs());
1884 if (!ICA.isTypeBasedOnly())
1885 NewArgs = NewArgs.drop_back(2);
1887
1888 // VPReduction intrinsics have a start value argument that their non-vp
1889 // counterparts do not have, except for the fadd and fmul non-vp
1890 // counterpart.
1892 *FID != Intrinsic::vector_reduce_fadd &&
1893 *FID != Intrinsic::vector_reduce_fmul) {
1894 if (!ICA.isTypeBasedOnly())
1895 NewArgs = NewArgs.drop_front();
1896 NewTys = NewTys.drop_front();
1897 }
1898
1899 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewArgs,
1900 NewTys, ICA.getFlags());
1901 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1902 }
1903 }
1904
1905 if (ICA.isTypeBasedOnly())
1907
1908 Type *RetTy = ICA.getReturnType();
1909
1910 ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy)
1912
1913 const IntrinsicInst *I = ICA.getInst();
1914 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1915 FastMathFlags FMF = ICA.getFlags();
1916 switch (IID) {
1917 default:
1918 break;
1919
1920 case Intrinsic::powi:
1921 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1922 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1923 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1924 ShouldOptForSize)) {
1925 // The cost is modeled on the expansion performed by ExpandPowI in
1926 // SelectionDAGBuilder.
1927 APInt Exponent = RHSC->getValue().abs();
1928 unsigned ActiveBits = Exponent.getActiveBits();
1929 unsigned PopCount = Exponent.popcount();
1930 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1931 thisT()->getArithmeticInstrCost(
1932 Instruction::FMul, RetTy, CostKind);
1933 if (RHSC->isNegative())
1934 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1935 CostKind);
1936 return Cost;
1937 }
1938 }
1939 break;
1940 case Intrinsic::cttz:
1941 // FIXME: If necessary, this should go in target-specific overrides.
1942 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1944 break;
1945
1946 case Intrinsic::ctlz:
1947 // FIXME: If necessary, this should go in target-specific overrides.
1948 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1950 break;
1951
1952 case Intrinsic::memcpy:
1953 return thisT()->getMemcpyCost(ICA.getInst());
1954
1955 case Intrinsic::masked_scatter: {
1956 const Value *Mask = Args[2];
1957 bool VarMask = !isa<Constant>(Mask);
1958 Align Alignment = I->getParamAlign(1).valueOrOne();
1959 return thisT()->getMemIntrinsicInstrCost(
1960 MemIntrinsicCostAttributes(Intrinsic::masked_scatter,
1961 ICA.getArgTypes()[0], Args[1], VarMask,
1962 Alignment, I),
1963 CostKind);
1964 }
1965 case Intrinsic::masked_gather: {
1966 const Value *Mask = Args[1];
1967 bool VarMask = !isa<Constant>(Mask);
1968 Align Alignment = I->getParamAlign(0).valueOrOne();
1969 return thisT()->getMemIntrinsicInstrCost(
1970 MemIntrinsicCostAttributes(Intrinsic::masked_gather, RetTy, Args[0],
1971 VarMask, Alignment, I),
1972 CostKind);
1973 }
1974 case Intrinsic::masked_compressstore: {
1975 const Value *Data = Args[0];
1976 const Value *Mask = Args[2];
1977 Align Alignment = I->getParamAlign(1).valueOrOne();
1978 return thisT()->getMemIntrinsicInstrCost(
1979 MemIntrinsicCostAttributes(IID, Data->getType(), !isa<Constant>(Mask),
1980 Alignment, I),
1981 CostKind);
1982 }
1983 case Intrinsic::masked_expandload: {
1984 const Value *Mask = Args[1];
1985 Align Alignment = I->getParamAlign(0).valueOrOne();
1986 return thisT()->getMemIntrinsicInstrCost(
1987 MemIntrinsicCostAttributes(IID, RetTy, !isa<Constant>(Mask),
1988 Alignment, I),
1989 CostKind);
1990 }
1991 case Intrinsic::experimental_vp_strided_store: {
1992 const Value *Data = Args[0];
1993 const Value *Ptr = Args[1];
1994 const Value *Mask = Args[3];
1995 const Value *EVL = Args[4];
1996 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1997 Type *EltTy = cast<VectorType>(Data->getType())->getElementType();
1998 Align Alignment =
1999 I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
2000 return thisT()->getMemIntrinsicInstrCost(
2001 MemIntrinsicCostAttributes(IID, Data->getType(), Ptr, VarMask,
2002 Alignment, I),
2003 CostKind);
2004 }
2005 case Intrinsic::experimental_vp_strided_load: {
2006 const Value *Ptr = Args[0];
2007 const Value *Mask = Args[2];
2008 const Value *EVL = Args[3];
2009 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
2010 Type *EltTy = cast<VectorType>(RetTy)->getElementType();
2011 Align Alignment =
2012 I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
2013 return thisT()->getMemIntrinsicInstrCost(
2014 MemIntrinsicCostAttributes(IID, RetTy, Ptr, VarMask, Alignment, I),
2015 CostKind);
2016 }
2017 case Intrinsic::stepvector: {
2018 if (isa<ScalableVectorType>(RetTy))
2020 // The cost of materialising a constant integer vector.
2022 }
2023 case Intrinsic::vector_extract: {
2024 // FIXME: Handle case where a scalable vector is extracted from a scalable
2025 // vector
2026 if (isa<ScalableVectorType>(RetTy))
2028 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
2029 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
2030 cast<VectorType>(RetTy),
2031 cast<VectorType>(Args[0]->getType()), {},
2032 CostKind, Index, cast<VectorType>(RetTy));
2033 }
2034 case Intrinsic::vector_insert: {
2035 // FIXME: Handle case where a scalable vector is inserted into a scalable
2036 // vector
2037 if (isa<ScalableVectorType>(Args[1]->getType()))
2039 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
2040 return thisT()->getShuffleCost(
2042 cast<VectorType>(Args[0]->getType()), {}, CostKind, Index,
2043 cast<VectorType>(Args[1]->getType()));
2044 }
2045 case Intrinsic::vector_splice_left:
2046 case Intrinsic::vector_splice_right: {
2047 auto *COffset = dyn_cast<ConstantInt>(Args[2]);
2048 if (!COffset)
2049 break;
2050 unsigned Index = COffset->getZExtValue();
2051 return thisT()->getShuffleCost(
2053 cast<VectorType>(Args[0]->getType()), {}, CostKind,
2054 IID == Intrinsic::vector_splice_left ? Index : -Index,
2055 cast<VectorType>(RetTy));
2056 }
2057 case Intrinsic::vector_reduce_add:
2058 case Intrinsic::vector_reduce_mul:
2059 case Intrinsic::vector_reduce_and:
2060 case Intrinsic::vector_reduce_or:
2061 case Intrinsic::vector_reduce_xor:
2062 case Intrinsic::vector_reduce_smax:
2063 case Intrinsic::vector_reduce_smin:
2064 case Intrinsic::vector_reduce_fmax:
2065 case Intrinsic::vector_reduce_fmin:
2066 case Intrinsic::vector_reduce_fmaximum:
2067 case Intrinsic::vector_reduce_fminimum:
2068 case Intrinsic::vector_reduce_umax:
2069 case Intrinsic::vector_reduce_umin: {
2070 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
2072 }
2073 case Intrinsic::vector_reduce_fadd:
2074 case Intrinsic::vector_reduce_fmul: {
2076 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
2078 }
2079 case Intrinsic::fshl:
2080 case Intrinsic::fshr: {
2081 const Value *X = Args[0];
2082 const Value *Y = Args[1];
2083 const Value *Z = Args[2];
2086 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
2087
2088 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
2089 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
2091 Cost +=
2092 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2093 Cost += thisT()->getArithmeticInstrCost(
2094 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
2095 {OpInfoZ.Kind, TTI::OP_None});
2096 Cost += thisT()->getArithmeticInstrCost(
2097 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
2098 {OpInfoZ.Kind, TTI::OP_None});
2099
2100 if (!OpInfoZ.isConstant()) {
2101 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
2102 CostKind);
2103 // Non-constant shift amounts require a modulo. If the type size is a
2104 // power of 2 then this will be converted to an and, otherwise it will use
2105 // a urem.
2106 Cost += thisT()->getArithmeticInstrCost(
2107 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? BinaryOperator::And
2108 : BinaryOperator::URem,
2109 RetTy, CostKind, OpInfoZ,
2110 {TTI::OK_UniformConstantValue, TTI::OP_None});
2111 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
2112 if (X != Y) {
2113 Type *CondTy = RetTy->getWithNewBitWidth(1);
2114 Cost += thisT()->getCmpSelInstrCost(
2115 BinaryOperator::ICmp, RetTy, CondTy, CmpInst::ICMP_EQ, CostKind);
2116 Cost +=
2117 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2119 }
2120 }
2121 return Cost;
2122 }
2123 case Intrinsic::experimental_cttz_elts: {
2124 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
2125
2126 // If we're not expanding the intrinsic then we assume this is cheap
2127 // to implement.
2128 if (!getTLI()->shouldExpandCttzElements(ArgType))
2129 return getTypeLegalizationCost(RetTy).first;
2130
2131 // TODO: The costs below reflect the expansion code in
2132 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
2133 // favour of compile time.
2134
2135 // Find the smallest "sensible" element type to use for the expansion.
2136 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
2137 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
2138 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
2139 VScaleRange = getVScaleRange(I->getCaller(), 64);
2140
2141 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
2142 getTLI()->getValueType(DL, RetTy), ArgType.getVectorElementCount(),
2143 ZeroIsPoison, &VScaleRange);
2144 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
2145
2146 // Create the new vector type & get the vector length
2147 Type *NewVecTy = VectorType::get(
2148 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
2149
2150 IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, NewVecTy, {},
2151 FMF);
2153 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
2154
2155 Cost +=
2156 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
2157 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
2158 Args[0]->getType(),
2160 Cost +=
2161 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
2162
2163 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
2164 NewEltTy, NewVecTy, FMF, I, 1);
2165 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
2166 Cost +=
2167 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
2168
2169 return Cost;
2170 }
2171 case Intrinsic::get_active_lane_mask:
2172 case Intrinsic::experimental_vector_match:
2173 case Intrinsic::experimental_vector_histogram_add:
2174 case Intrinsic::experimental_vector_histogram_uadd_sat:
2175 case Intrinsic::experimental_vector_histogram_umax:
2176 case Intrinsic::experimental_vector_histogram_umin:
2177 case Intrinsic::masked_udiv:
2178 case Intrinsic::masked_sdiv:
2179 case Intrinsic::masked_urem:
2180 case Intrinsic::masked_srem:
2181 return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2182 case Intrinsic::modf:
2183 case Intrinsic::sincos:
2184 case Intrinsic::sincospi: {
2185 std::optional<unsigned> CallRetElementIndex;
2186 // The first element of the modf result is returned by value in the
2187 // libcall.
2188 if (ICA.getID() == Intrinsic::modf)
2189 CallRetElementIndex = 0;
2190
2191 if (auto Cost = getMultipleResultIntrinsicVectorLibCallCost(
2192 ICA, CostKind, CallRetElementIndex))
2193 return *Cost;
2194 // Otherwise, fallback to default scalarization cost.
2195 break;
2196 }
2197 case Intrinsic::loop_dependence_war_mask:
2198 case Intrinsic::loop_dependence_raw_mask: {
2199 // Compute the cost of the expanded version of these intrinsics:
2200 //
2201 // The possible expansions are...
2202 //
2203 // loop_dependence_war_mask:
2204 // diff = (addrB - addrA) / eltSize
2205 // cmp = icmp sle diff, 0
2206 // upper_bound = select cmp, -1, diff
2207 // mask = get_active_lane_mask 0, upper_bound
2208 //
2209 // loop_dependence_raw_mask:
2210 // diff = (abs(addrB - addrA)) / eltSize
2211 // cmp = icmp eq diff, 0
2212 // upper_bound = select cmp, -1, diff
2213 // mask = get_active_lane_mask 0, upper_bound
2214 //
2215 Type *AddrTy = ICA.getArgTypes()[0];
2216 bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;
2217
2219 thisT()->getArithmeticInstrCost(Instruction::Sub, AddrTy, CostKind);
2220 if (IsReadAfterWrite) {
2221 IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, AddrTy, {AddrTy}, {});
2222 Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind);
2223 }
2224
2225 TTI::OperandValueInfo EltSizeOpInfo =
2226 TTI::getOperandInfo(ICA.getArgs()[2]);
2227 Cost += thisT()->getArithmeticInstrCost(Instruction::SDiv, AddrTy,
2228 CostKind, {}, EltSizeOpInfo);
2229
2230 Type *CondTy = IntegerType::getInt1Ty(RetTy->getContext());
2231 CmpInst::Predicate Pred =
2232 IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE;
2233 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CondTy, AddrTy,
2234 Pred, CostKind);
2235 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, AddrTy,
2236 CondTy, Pred, CostKind);
2237
2238 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
2239 {AddrTy, AddrTy}, FMF);
2240 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2241 return Cost;
2242 }
2243 }
2244
2245 // Assume that we need to scalarize this intrinsic.
2246 // Compute the scalarization overhead based on Args for a vector
2247 // intrinsic.
2248 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2249 if (RetVF.isVector() && !RetVF.isScalable()) {
2250 ScalarizationCost = 0;
2251 if (!RetTy->isVoidTy()) {
2252 for (Type *VectorTy : getContainedTypes(RetTy)) {
2253 ScalarizationCost += getScalarizationOverhead(
2254 cast<VectorType>(VectorTy),
2255 /*Insert=*/true, /*Extract=*/false, CostKind);
2256 }
2257 }
2258 ScalarizationCost += getOperandsScalarizationOverhead(
2259 filterConstantAndDuplicatedOperands(Args, ICA.getArgTypes()),
2260 CostKind);
2261 }
2262
2263 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
2264 ScalarizationCost);
2265 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
2266 }
2267
2268 /// Get intrinsic cost based on argument types.
2269 /// If ICA.skipScalarizationCost() is false, the cost of scalarizing the
2270 /// arguments and the return value will be computed based on types;
2271 /// otherwise the cost from ICA.getScalarizationCost() is used.
2275 Intrinsic::ID IID = ICA.getID();
2276 Type *RetTy = ICA.getReturnType();
2277 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
2278 FastMathFlags FMF = ICA.getFlags();
2279 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
2280 bool SkipScalarizationCost = ICA.skipScalarizationCost();
2281
2282 VectorType *VecOpTy = nullptr;
2283 if (!Tys.empty()) {
2284 // The vector reduction operand is operand 0 except for fadd/fmul.
2285 // Their operand 0 is a scalar start value, so the vector op is operand 1.
2286 unsigned VecTyIndex = 0;
2287 if (IID == Intrinsic::vector_reduce_fadd ||
2288 IID == Intrinsic::vector_reduce_fmul)
2289 VecTyIndex = 1;
2290 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
2291 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
2292 }
2293
2294 // Library call cost - other than size, make it expensive.
2295 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
2296 unsigned ISD = 0;
2297 switch (IID) {
2298 default: {
2299 // Scalable vectors cannot be scalarized, so return Invalid.
2300 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2301 return isa<ScalableVectorType>(Ty);
2302 }))
2304
2305 // Assume that we need to scalarize this intrinsic.
2306 InstructionCost ScalarizationCost =
2307 SkipScalarizationCost ? ScalarizationCostPassed : 0;
2308 unsigned ScalarCalls = 1;
2309 Type *ScalarRetTy = RetTy;
2310 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2311 if (!SkipScalarizationCost)
2312 ScalarizationCost = getScalarizationOverhead(
2313 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
2314 ScalarCalls = std::max(ScalarCalls,
2316 ScalarRetTy = RetTy->getScalarType();
2317 }
2318 SmallVector<Type *, 4> ScalarTys;
2319 for (Type *Ty : Tys) {
2320 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2321 if (!SkipScalarizationCost)
2322 ScalarizationCost += getScalarizationOverhead(
2323 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2324 ScalarCalls = std::max(ScalarCalls,
2326 Ty = Ty->getScalarType();
2327 }
2328 ScalarTys.push_back(Ty);
2329 }
2330 if (ScalarCalls == 1)
2331 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
2332
2333 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
2334 InstructionCost ScalarCost =
2335 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
2336
2337 return ScalarCalls * ScalarCost + ScalarizationCost;
2338 }
2339 // Look for intrinsics that can be lowered directly or turned into a scalar
2340 // intrinsic call.
2341 case Intrinsic::sqrt:
2342 ISD = ISD::FSQRT;
2343 break;
2344 case Intrinsic::sin:
2345 ISD = ISD::FSIN;
2346 break;
2347 case Intrinsic::cos:
2348 ISD = ISD::FCOS;
2349 break;
2350 case Intrinsic::sincos:
2351 ISD = ISD::FSINCOS;
2352 break;
2353 case Intrinsic::sincospi:
2355 break;
2356 case Intrinsic::modf:
2357 ISD = ISD::FMODF;
2358 break;
2359 case Intrinsic::tan:
2360 ISD = ISD::FTAN;
2361 break;
2362 case Intrinsic::asin:
2363 ISD = ISD::FASIN;
2364 break;
2365 case Intrinsic::acos:
2366 ISD = ISD::FACOS;
2367 break;
2368 case Intrinsic::atan:
2369 ISD = ISD::FATAN;
2370 break;
2371 case Intrinsic::atan2:
2372 ISD = ISD::FATAN2;
2373 break;
2374 case Intrinsic::sinh:
2375 ISD = ISD::FSINH;
2376 break;
2377 case Intrinsic::cosh:
2378 ISD = ISD::FCOSH;
2379 break;
2380 case Intrinsic::tanh:
2381 ISD = ISD::FTANH;
2382 break;
2383 case Intrinsic::exp:
2384 ISD = ISD::FEXP;
2385 break;
2386 case Intrinsic::exp2:
2387 ISD = ISD::FEXP2;
2388 break;
2389 case Intrinsic::exp10:
2390 ISD = ISD::FEXP10;
2391 break;
2392 case Intrinsic::log:
2393 ISD = ISD::FLOG;
2394 break;
2395 case Intrinsic::log10:
2396 ISD = ISD::FLOG10;
2397 break;
2398 case Intrinsic::log2:
2399 ISD = ISD::FLOG2;
2400 break;
2401 case Intrinsic::ldexp:
2402 ISD = ISD::FLDEXP;
2403 break;
2404 case Intrinsic::fabs:
2405 ISD = ISD::FABS;
2406 break;
2407 case Intrinsic::canonicalize:
2409 break;
2410 case Intrinsic::minnum:
2411 ISD = ISD::FMINNUM;
2412 break;
2413 case Intrinsic::maxnum:
2414 ISD = ISD::FMAXNUM;
2415 break;
2416 case Intrinsic::minimum:
2418 break;
2419 case Intrinsic::maximum:
2421 break;
2422 case Intrinsic::minimumnum:
2424 break;
2425 case Intrinsic::maximumnum:
2427 break;
2428 case Intrinsic::copysign:
2430 break;
2431 case Intrinsic::floor:
2432 ISD = ISD::FFLOOR;
2433 break;
2434 case Intrinsic::ceil:
2435 ISD = ISD::FCEIL;
2436 break;
2437 case Intrinsic::trunc:
2438 ISD = ISD::FTRUNC;
2439 break;
2440 case Intrinsic::nearbyint:
2442 break;
2443 case Intrinsic::rint:
2444 ISD = ISD::FRINT;
2445 break;
2446 case Intrinsic::lrint:
2447 ISD = ISD::LRINT;
2448 break;
2449 case Intrinsic::llrint:
2450 ISD = ISD::LLRINT;
2451 break;
2452 case Intrinsic::round:
2453 ISD = ISD::FROUND;
2454 break;
2455 case Intrinsic::roundeven:
2457 break;
2458 case Intrinsic::lround:
2459 ISD = ISD::LROUND;
2460 break;
2461 case Intrinsic::llround:
2462 ISD = ISD::LLROUND;
2463 break;
2464 case Intrinsic::pow:
2465 ISD = ISD::FPOW;
2466 break;
2467 case Intrinsic::fma:
2468 ISD = ISD::FMA;
2469 break;
2470 case Intrinsic::fmuladd:
2471 ISD = ISD::FMA;
2472 break;
2473 case Intrinsic::experimental_constrained_fmuladd:
2475 break;
2476 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2477 case Intrinsic::lifetime_start:
2478 case Intrinsic::lifetime_end:
2479 case Intrinsic::sideeffect:
2480 case Intrinsic::pseudoprobe:
2481 case Intrinsic::arithmetic_fence:
2482 return 0;
2483 case Intrinsic::masked_store: {
2484 Type *Ty = Tys[0];
2485 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2486 return thisT()->getMemIntrinsicInstrCost(
2487 MemIntrinsicCostAttributes(IID, Ty, TyAlign, 0), CostKind);
2488 }
2489 case Intrinsic::masked_load: {
2490 Type *Ty = RetTy;
2491 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2492 return thisT()->getMemIntrinsicInstrCost(
2493 MemIntrinsicCostAttributes(IID, Ty, TyAlign, 0), CostKind);
2494 }
2495 case Intrinsic::experimental_vp_strided_store: {
2496 auto *Ty = cast<VectorType>(ICA.getArgTypes()[0]);
2497 Align Alignment = thisT()->DL.getABITypeAlign(Ty->getElementType());
2498 return thisT()->getMemIntrinsicInstrCost(
2499 MemIntrinsicCostAttributes(IID, Ty, /*Ptr=*/nullptr,
2500 /*VariableMask=*/true, Alignment,
2501 ICA.getInst()),
2502 CostKind);
2503 }
2504 case Intrinsic::experimental_vp_strided_load: {
2505 auto *Ty = cast<VectorType>(ICA.getReturnType());
2506 Align Alignment = thisT()->DL.getABITypeAlign(Ty->getElementType());
2507 return thisT()->getMemIntrinsicInstrCost(
2508 MemIntrinsicCostAttributes(IID, Ty, /*Ptr=*/nullptr,
2509 /*VariableMask=*/true, Alignment,
2510 ICA.getInst()),
2511 CostKind);
2512 }
2513 case Intrinsic::vector_reduce_add:
2514 case Intrinsic::vector_reduce_mul:
2515 case Intrinsic::vector_reduce_and:
2516 case Intrinsic::vector_reduce_or:
2517 case Intrinsic::vector_reduce_xor:
2518 return thisT()->getArithmeticReductionCost(
2519 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2520 CostKind);
2521 case Intrinsic::vector_reduce_fadd:
2522 case Intrinsic::vector_reduce_fmul:
2523 return thisT()->getArithmeticReductionCost(
2524 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2525 case Intrinsic::vector_reduce_smax:
2526 case Intrinsic::vector_reduce_smin:
2527 case Intrinsic::vector_reduce_umax:
2528 case Intrinsic::vector_reduce_umin:
2529 case Intrinsic::vector_reduce_fmax:
2530 case Intrinsic::vector_reduce_fmin:
2531 case Intrinsic::vector_reduce_fmaximum:
2532 case Intrinsic::vector_reduce_fminimum:
2533 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2534 VecOpTy, ICA.getFlags(), CostKind);
2535 case Intrinsic::experimental_vector_match: {
2536 auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
2537 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
2538 unsigned SearchSize = NeedleTy->getNumElements();
2539
2540 // If we're not expanding the intrinsic then we assume this is cheap to
2541 // implement.
2542 EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
2543 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
2544 return getTypeLegalizationCost(RetTy).first;
2545
2546 // Approximate the cost based on the expansion code in
2547 // SelectionDAGBuilder.
2549 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
2550 CostKind, 1, nullptr, nullptr);
2551 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
2552 CostKind, 0, nullptr, nullptr);
2553 Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, SearchTy, {},
2554 CostKind, 0, nullptr);
2555 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
2557 Cost +=
2558 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2559 Cost *= SearchSize;
2560 Cost +=
2561 thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
2562 return Cost;
2563 }
2564 case Intrinsic::vector_reverse:
2565 return thisT()->getShuffleCost(TTI::SK_Reverse, cast<VectorType>(RetTy),
2566 cast<VectorType>(ICA.getArgTypes()[0]), {},
2567 CostKind, 0, cast<VectorType>(RetTy));
2568 case Intrinsic::experimental_vector_histogram_add:
2569 case Intrinsic::experimental_vector_histogram_uadd_sat:
2570 case Intrinsic::experimental_vector_histogram_umax:
2571 case Intrinsic::experimental_vector_histogram_umin: {
2573 Type *EltTy = ICA.getArgTypes()[1];
2574
2575 // Targets with scalable vectors must handle this on their own.
2576 if (!PtrsTy)
2578
2579 Align Alignment = thisT()->DL.getABITypeAlign(EltTy);
2581 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, PtrsTy,
2582 CostKind, 1, nullptr, nullptr);
2583 Cost += thisT()->getMemoryOpCost(Instruction::Load, EltTy, Alignment, 0,
2584 CostKind);
2585 switch (IID) {
2586 default:
2587 llvm_unreachable("Unhandled histogram update operation.");
2588 case Intrinsic::experimental_vector_histogram_add:
2589 Cost +=
2590 thisT()->getArithmeticInstrCost(Instruction::Add, EltTy, CostKind);
2591 break;
2592 case Intrinsic::experimental_vector_histogram_uadd_sat: {
2593 IntrinsicCostAttributes UAddSat(Intrinsic::uadd_sat, EltTy, {EltTy});
2594 Cost += thisT()->getIntrinsicInstrCost(UAddSat, CostKind);
2595 break;
2596 }
2597 case Intrinsic::experimental_vector_histogram_umax: {
2598 IntrinsicCostAttributes UMax(Intrinsic::umax, EltTy, {EltTy});
2599 Cost += thisT()->getIntrinsicInstrCost(UMax, CostKind);
2600 break;
2601 }
2602 case Intrinsic::experimental_vector_histogram_umin: {
2603 IntrinsicCostAttributes UMin(Intrinsic::umin, EltTy, {EltTy});
2604 Cost += thisT()->getIntrinsicInstrCost(UMin, CostKind);
2605 break;
2606 }
2607 }
2608 Cost += thisT()->getMemoryOpCost(Instruction::Store, EltTy, Alignment, 0,
2609 CostKind);
2610 Cost *= PtrsTy->getNumElements();
2611 return Cost;
2612 }
2613 case Intrinsic::get_active_lane_mask: {
2614 Type *ArgTy = ICA.getArgTypes()[0];
2615 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
2616 EVT ArgVT = getTLI()->getValueType(DL, ArgTy, true);
2617
2618 // If we're not expanding the intrinsic then we assume this is cheap
2619 // to implement.
2620 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgVT))
2621 return getTypeLegalizationCost(RetTy).first;
2622
2623 // Create the expanded types that will be used to calculate the uadd_sat
2624 // operation.
2625 Type *ExpRetTy =
2626 VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount());
2627 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
2629 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
2630 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
2632 return Cost;
2633 }
2634 case Intrinsic::experimental_memset_pattern:
2635 // This cost is set to match the cost of the memset_pattern16 libcall.
2636 // It should likely be re-evaluated after migration to this intrinsic
2637 // is complete.
2638 return TTI::TCC_Basic * 4;
2639 case Intrinsic::abs:
2640 ISD = ISD::ABS;
2641 break;
2642 case Intrinsic::fshl:
2643 ISD = ISD::FSHL;
2644 break;
2645 case Intrinsic::fshr:
2646 ISD = ISD::FSHR;
2647 break;
2648 case Intrinsic::smax:
2649 ISD = ISD::SMAX;
2650 break;
2651 case Intrinsic::smin:
2652 ISD = ISD::SMIN;
2653 break;
2654 case Intrinsic::umax:
2655 ISD = ISD::UMAX;
2656 break;
2657 case Intrinsic::umin:
2658 ISD = ISD::UMIN;
2659 break;
2660 case Intrinsic::sadd_sat:
2661 ISD = ISD::SADDSAT;
2662 break;
2663 case Intrinsic::ssub_sat:
2664 ISD = ISD::SSUBSAT;
2665 break;
2666 case Intrinsic::uadd_sat:
2667 ISD = ISD::UADDSAT;
2668 break;
2669 case Intrinsic::usub_sat:
2670 ISD = ISD::USUBSAT;
2671 break;
2672 case Intrinsic::smul_fix:
2673 ISD = ISD::SMULFIX;
2674 break;
2675 case Intrinsic::umul_fix:
2676 ISD = ISD::UMULFIX;
2677 break;
2678 case Intrinsic::sadd_with_overflow:
2679 ISD = ISD::SADDO;
2680 break;
2681 case Intrinsic::ssub_with_overflow:
2682 ISD = ISD::SSUBO;
2683 break;
2684 case Intrinsic::uadd_with_overflow:
2685 ISD = ISD::UADDO;
2686 break;
2687 case Intrinsic::usub_with_overflow:
2688 ISD = ISD::USUBO;
2689 break;
2690 case Intrinsic::smul_with_overflow:
2691 ISD = ISD::SMULO;
2692 break;
2693 case Intrinsic::umul_with_overflow:
2694 ISD = ISD::UMULO;
2695 break;
2696 case Intrinsic::fptosi_sat:
2697 case Intrinsic::fptoui_sat: {
2698 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Tys[0]);
2699 std::pair<InstructionCost, MVT> RetLT = getTypeLegalizationCost(RetTy);
2700
2701 // For cast instructions, types are different between source and
2702 // destination. Also need to check if the source type can be legalized.
2703 if (!SrcLT.first.isValid() || !RetLT.first.isValid())
2705 ISD = IID == Intrinsic::fptosi_sat ? ISD::FP_TO_SINT_SAT
2707 break;
2708 }
2709 case Intrinsic::ctpop:
2710 ISD = ISD::CTPOP;
2711 // In case of legalization use TCC_Expensive. This is cheaper than a
2712 // library call but still not a cheap instruction.
2713 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2714 break;
2715 case Intrinsic::ctlz:
2716 ISD = ISD::CTLZ;
2717 break;
2718 case Intrinsic::cttz:
2719 ISD = ISD::CTTZ;
2720 break;
2721 case Intrinsic::bswap:
2722 ISD = ISD::BSWAP;
2723 break;
2724 case Intrinsic::bitreverse:
2726 break;
2727 case Intrinsic::ucmp:
2728 ISD = ISD::UCMP;
2729 break;
2730 case Intrinsic::scmp:
2731 ISD = ISD::SCMP;
2732 break;
2733 case Intrinsic::clmul:
2734 ISD = ISD::CLMUL;
2735 break;
2736 case Intrinsic::masked_udiv:
2737 case Intrinsic::masked_sdiv:
2738 case Intrinsic::masked_urem:
2739 case Intrinsic::masked_srem: {
2740 unsigned UnmaskedOpc;
2741 switch (IID) {
2742 case Intrinsic::masked_udiv:
2744 UnmaskedOpc = Instruction::UDiv;
2745 break;
2746 case Intrinsic::masked_sdiv:
2748 UnmaskedOpc = Instruction::SDiv;
2749 break;
2750 case Intrinsic::masked_urem:
2752 UnmaskedOpc = Instruction::URem;
2753 break;
2754 case Intrinsic::masked_srem:
2756 UnmaskedOpc = Instruction::SRem;
2757 break;
2758 default:
2759 llvm_unreachable("Unexpected intrinsic ID");
2760 }
2762 thisT()->getArithmeticInstrCost(UnmaskedOpc, RetTy, CostKind);
2763
2764 // Expansion generates a (select %mask, %rhs, 1) for the divisor.
2765 MVT LT = getTypeLegalizationCost(RetTy).second;
2766 if (!getTLI()->isOperationLegalOrCustom(ISD, LT)) {
2767 Type *CondTy = cast<VectorType>(RetTy)->getWithNewType(
2769 Cost += thisT()->getCmpSelInstrCost(
2770 BinaryOperator::Select, RetTy, CondTy, CmpInst::BAD_ICMP_PREDICATE,
2772 }
2773
2774 return Cost;
2775 }
2776 }
2777
2778 auto *ST = dyn_cast<StructType>(RetTy);
2779 Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy;
2780 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy);
2781
2782 const TargetLoweringBase *TLI = getTLI();
2783
2784 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2785 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2786 TLI->isFAbsFree(LT.second)) {
2787 return 0;
2788 }
2789
2790 // The operation is legal. Assume it costs 1.
2791 // If the type is split to multiple registers, assume that there is some
2792 // overhead to this.
2793 // TODO: Once we have extract/insert subvector cost we need to use them.
2794 if (LT.first > 1)
2795 return (LT.first * 2);
2796 else
2797 return (LT.first * 1);
2798 } else if (TLI->isOperationCustom(ISD, LT.second)) {
2799 // If the operation is custom lowered then assume
2800 // that the code is twice as expensive.
2801 return (LT.first * 2);
2802 }
2803
2804 switch (IID) {
2805 case Intrinsic::fmuladd: {
2806 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2807 // point mul followed by an add.
2808
2809 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2810 CostKind) +
2811 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2812 CostKind);
2813 }
2814 case Intrinsic::experimental_constrained_fmuladd: {
2815 IntrinsicCostAttributes FMulAttrs(
2816 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2817 IntrinsicCostAttributes FAddAttrs(
2818 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2819 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2820 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2821 }
2822 case Intrinsic::smin:
2823 case Intrinsic::smax:
2824 case Intrinsic::umin:
2825 case Intrinsic::umax: {
2826 // minmax(X,Y) = select(icmp(X,Y),X,Y)
2827 Type *CondTy = RetTy->getWithNewBitWidth(1);
2828 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2829 CmpInst::Predicate Pred =
2830 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2832 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2833 Pred, CostKind);
2834 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2835 Pred, CostKind);
2836 return Cost;
2837 }
2838 case Intrinsic::sadd_with_overflow:
2839 case Intrinsic::ssub_with_overflow: {
2840 Type *SumTy = RetTy->getContainedType(0);
2841 Type *OverflowTy = RetTy->getContainedType(1);
2842 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2843 ? BinaryOperator::Add
2844 : BinaryOperator::Sub;
2845
2846 // Add:
2847 // Overflow -> (Result < LHS) ^ (RHS < 0)
2848 // Sub:
2849 // Overflow -> (Result < LHS) ^ (RHS > 0)
2851 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2852 Cost +=
2853 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy,
2855 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2856 CostKind);
2857 return Cost;
2858 }
2859 case Intrinsic::uadd_with_overflow:
2860 case Intrinsic::usub_with_overflow: {
2861 Type *SumTy = RetTy->getContainedType(0);
2862 Type *OverflowTy = RetTy->getContainedType(1);
2863 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2864 ? BinaryOperator::Add
2865 : BinaryOperator::Sub;
2866 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2869
2871 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2872 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
2873 OverflowTy, Pred, CostKind);
2874 return Cost;
2875 }
2876 case Intrinsic::smul_with_overflow:
2877 case Intrinsic::umul_with_overflow: {
2878 Type *MulTy = RetTy->getContainedType(0);
2879 Type *OverflowTy = RetTy->getContainedType(1);
2880 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2881 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2882 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2883
2884 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2886
2888 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2889 Cost +=
2890 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2891 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2892 CCH, CostKind);
2893 Cost += thisT()->getArithmeticInstrCost(
2894 Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2896
2897 if (IsSigned)
2898 Cost += thisT()->getArithmeticInstrCost(
2899 Instruction::AShr, MulTy, CostKind,
2902
2903 Cost += thisT()->getCmpSelInstrCost(
2904 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2905 return Cost;
2906 }
2907 case Intrinsic::sadd_sat:
2908 case Intrinsic::ssub_sat: {
2909 // Assume a default expansion.
2910 Type *CondTy = RetTy->getWithNewBitWidth(1);
2911
2912 Type *OpTy = StructType::create({RetTy, CondTy});
2913 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2914 ? Intrinsic::sadd_with_overflow
2915 : Intrinsic::ssub_with_overflow;
2917
2918 // SatMax -> Overflow && SumDiff < 0
2919 // SatMin -> Overflow && SumDiff >= 0
2921 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2922 nullptr, ScalarizationCostPassed);
2923 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2924 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2925 Pred, CostKind);
2926 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2927 CondTy, Pred, CostKind);
2928 return Cost;
2929 }
2930 case Intrinsic::uadd_sat:
2931 case Intrinsic::usub_sat: {
2932 Type *CondTy = RetTy->getWithNewBitWidth(1);
2933
2934 Type *OpTy = StructType::create({RetTy, CondTy});
2935 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2936 ? Intrinsic::uadd_with_overflow
2937 : Intrinsic::usub_with_overflow;
2938
2940 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2941 nullptr, ScalarizationCostPassed);
2942 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2943 Cost +=
2944 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2946 return Cost;
2947 }
2948 case Intrinsic::smul_fix:
2949 case Intrinsic::umul_fix: {
2950 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2951 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2952
2953 unsigned ExtOp =
2954 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2956
2958 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2959 Cost +=
2960 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2961 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2962 CCH, CostKind);
2963 Cost += thisT()->getArithmeticInstrCost(
2964 Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2966 Cost += thisT()->getArithmeticInstrCost(
2967 Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2969 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2970 return Cost;
2971 }
2972 case Intrinsic::abs: {
2973 // abs(X) = select(icmp(X,0),X,sub(0,X))
2974 Type *CondTy = RetTy->getWithNewBitWidth(1);
2977 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2978 Pred, CostKind);
2979 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2980 Pred, CostKind);
2981 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2982 Cost += thisT()->getArithmeticInstrCost(
2983 BinaryOperator::Sub, RetTy, CostKind,
2985 return Cost;
2986 }
2987 case Intrinsic::fshl:
2988 case Intrinsic::fshr: {
2989 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
2990 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
2991 Type *CondTy = RetTy->getWithNewBitWidth(1);
2993 Cost +=
2994 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2995 Cost +=
2996 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
2997 Cost +=
2998 thisT()->getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
2999 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::LShr, RetTy,
3000 CostKind);
3001 // Non-constant shift amounts require a modulo. If the type size is a
3002 // power of 2 then this will be converted to an and, otherwise it will use a
3003 // urem.
3004 Cost += thisT()->getArithmeticInstrCost(
3005 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? BinaryOperator::And
3006 : BinaryOperator::URem,
3007 RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
3008 {TTI::OK_UniformConstantValue, TTI::OP_None});
3009 // Shift-by-zero handling.
3010 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
3012 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
3014 return Cost;
3015 }
3016 case Intrinsic::fptosi_sat:
3017 case Intrinsic::fptoui_sat: {
3018 if (Tys.empty())
3019 break;
3020 Type *FromTy = Tys[0];
3021 bool IsSigned = IID == Intrinsic::fptosi_sat;
3022
3024 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
3025 {FromTy, FromTy});
3026 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
3027 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
3028 {FromTy, FromTy});
3029 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
3030 Cost += thisT()->getCastInstrCost(
3031 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
3033 if (IsSigned) {
3034 Type *CondTy = RetTy->getWithNewBitWidth(1);
3035 Cost += thisT()->getCmpSelInstrCost(
3036 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
3037 Cost += thisT()->getCmpSelInstrCost(
3038 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
3039 }
3040 return Cost;
3041 }
3042 case Intrinsic::ucmp:
3043 case Intrinsic::scmp: {
3044 Type *CmpTy = Tys[0];
3045 Type *CondTy = RetTy->getWithNewBitWidth(1);
3047 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
3049 CostKind) +
3050 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
3052 CostKind);
3053
3054 EVT VT = TLI->getValueType(DL, CmpTy, true);
3056 // x < y ? -1 : (x > y ? 1 : 0)
3057 Cost += 2 * thisT()->getCmpSelInstrCost(
3058 BinaryOperator::Select, RetTy, CondTy,
3060 } else {
3061 // zext(x > y) - zext(x < y)
3062 Cost +=
3063 2 * thisT()->getCastInstrCost(CastInst::ZExt, RetTy, CondTy,
3065 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
3066 CostKind);
3067 }
3068 return Cost;
3069 }
3070 case Intrinsic::maximumnum:
3071 case Intrinsic::minimumnum: {
3072 // On platforms that support FMAXNUM_IEEE/FMINNUM_IEEE, we expand
3073 // maximumnum/minimumnum to
3074 // ARG0 = fcanonicalize ARG0, ARG0 // to quiet ARG0
3075 // ARG1 = fcanonicalize ARG1, ARG1 // to quiet ARG1
3076 // RESULT = MAXNUM_IEEE ARG0, ARG1 // or MINNUM_IEEE
3077 // FIXME: In LangRef, we claimed FMAXNUM has the same behaviour as
3078 // FMAXNUM_IEEE, while the backend hasn't migrated the code yet.
3079 // Finally, we will remove FMAXNUM_IEEE and FMINNUM_IEEE.
3080 int IeeeISD =
3081 IID == Intrinsic::maximumnum ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
3082 if (TLI->isOperationLegal(IeeeISD, LT.second)) {
3083 IntrinsicCostAttributes FCanonicalizeAttrs(Intrinsic::canonicalize,
3084 RetTy, Tys[0]);
3085 InstructionCost FCanonicalizeCost =
3086 thisT()->getIntrinsicInstrCost(FCanonicalizeAttrs, CostKind);
3087 return LT.first + FCanonicalizeCost * 2;
3088 }
3089 break;
3090 }
3091 case Intrinsic::clmul: {
3092 // This cost model should match the expansion in
3093 // TargetLowering::expandCLMUL.
3094 unsigned BW = RetTy->getScalarSizeInBits();
3095 InstructionCost AndCost =
3096 thisT()->getArithmeticInstrCost(Instruction::And, RetTy, CostKind);
3097 InstructionCost OrCost =
3098 thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
3099 InstructionCost XorCost =
3100 thisT()->getArithmeticInstrCost(Instruction::Xor, RetTy, CostKind);
3101 InstructionCost MulCost =
3102 thisT()->getArithmeticInstrCost(Instruction::Mul, RetTy, CostKind);
3103
3104 // When the multiplication with holes approach is used, that emits 16
3105 // MULs, 8 + 4 ANDs, 12 XORs and 3 ORs.
3106 if (BW >= 32 && BW <= 64 &&
3108 TLI->getValueType(DL, RetTy))) {
3109 return 16 * MulCost + 12 * AndCost + 12 * XorCost + 3 * OrCost;
3110 }
3111
3112 InstructionCost PerBitCostMul = AndCost + MulCost + XorCost;
3113 InstructionCost PerBitCostBittest =
3114 AndCost +
3115 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, RetTy,
3117 thisT()->getCmpSelInstrCost(Instruction::ICmp, RetTy, RetTy,
3119 InstructionCost PerBitCost = std::min(PerBitCostMul, PerBitCostBittest);
3120 return BW * PerBitCost;
3121 }
3122 default:
3123 break;
3124 }
3125
3126 // Else, assume that we need to scalarize this intrinsic. For math builtins
3127 // this will emit a costly libcall, adding call overhead and spills. Make it
3128 // very expensive.
3129 if (isVectorizedTy(RetTy)) {
3130 ArrayRef<Type *> RetVTys = getContainedTypes(RetTy);
3131
3132 // Scalable vectors cannot be scalarized, so return Invalid.
3133 if (any_of(concat<Type *const>(RetVTys, Tys),
3134 [](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
3136
3137 InstructionCost ScalarizationCost = ScalarizationCostPassed;
3138 if (!SkipScalarizationCost) {
3139 ScalarizationCost = 0;
3140 for (Type *RetVTy : RetVTys) {
3141 ScalarizationCost += getScalarizationOverhead(
3142 cast<VectorType>(RetVTy), /*Insert=*/true,
3143 /*Extract=*/false, CostKind);
3144 }
3145 }
3146
3147 unsigned ScalarCalls = getVectorizedTypeVF(RetTy).getFixedValue();
3148 SmallVector<Type *, 4> ScalarTys;
3149 for (Type *Ty : Tys) {
3150 if (Ty->isVectorTy())
3151 Ty = Ty->getScalarType();
3152 ScalarTys.push_back(Ty);
3153 }
3154 IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF);
3155 InstructionCost ScalarCost =
3156 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
3157 for (Type *Ty : Tys) {
3158 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
3159 if (!ICA.skipScalarizationCost())
3160 ScalarizationCost += getScalarizationOverhead(
3161 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
3162 ScalarCalls = std::max(ScalarCalls,
3164 }
3165 }
3166 return ScalarCalls * ScalarCost + ScalarizationCost;
3167 }
3168
3169 // This is going to be turned into a library call, make it expensive.
3170 return SingleCallCost;
3171 }
3172
3173 /// Get memory intrinsic cost based on arguments.
///
/// Reads the intrinsic ID, data type, variable-mask flag and alignment from
/// MICA, maps the intrinsic to a Load or Store opcode and forwards the query
/// to getCommonMaskedMemoryOpCost. An ID that matches none of the case labels
/// reaches llvm_unreachable.
// NOTE(review): listing lines 3174-3175 (start of the signature), 3208 and
// 3229 are absent from this extract, so what the vp_load / vp_store and
// vp_load_ff labels do cannot be read off the visible code.
3176 TTI::TargetCostKind CostKind) const override {
3177 unsigned Id = MICA.getID();
3178 Type *DataTy = MICA.getDataType();
3179 bool VariableMask = MICA.getVariableMask();
3180 Align Alignment = MICA.getAlignment();
3181
3182 switch (Id) {
3183 case Intrinsic::experimental_vp_strided_load:
3184 case Intrinsic::experimental_vp_strided_store: {
// The load flavour is costed as a Load, the store flavour as a Store.
3185 unsigned Opcode = Id == Intrinsic::experimental_vp_strided_load
3186 ? Instruction::Load
3187 : Instruction::Store;
3188 // For a target without strided memory operations (or for an illegal
3189 // operation type on one which does), assume we lower to a gather/scatter
3190 // operation. (Which may in turn be scalarized.)
3191 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment,
3192 VariableMask, true, CostKind);
3193 }
3194 case Intrinsic::masked_scatter:
3195 case Intrinsic::masked_gather:
3196 case Intrinsic::vp_scatter:
3197 case Intrinsic::vp_gather: {
// Gathers are costed as loads, scatters as stores. The literal `true` sits in
// the argument position annotated /*IsGatherScatter*/ further down.
3198 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
3199 MICA.getID() == Intrinsic::vp_gather)
3200 ? Instruction::Load
3201 : Instruction::Store;
3202
3203 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment,
3204 VariableMask, true, CostKind);
3205 }
3206 case Intrinsic::vp_load:
3207 case Intrinsic::vp_store:
3209 case Intrinsic::masked_load:
3210 case Intrinsic::masked_store: {
3211 unsigned Opcode =
3212 Id == Intrinsic::masked_load ? Instruction::Load : Instruction::Store;
3213 // TODO: Pass on AddressSpace when we have test coverage.
// Unlike the other cases, a literal `true` is passed where they pass
// VariableMask, and `false` in the /*IsGatherScatter*/ position.
3214 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
3215 CostKind);
3216 }
3217 case Intrinsic::masked_compressstore:
3218 case Intrinsic::masked_expandload: {
3219 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
3220 ? Instruction::Load
3221 : Instruction::Store;
3222 // Treat expand load/compress store as gather/scatter operation.
3223 // TODO: implement more precise cost estimation for these intrinsics.
3224 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment,
3225 VariableMask,
3226 /*IsGatherScatter*/ true, CostKind);
3227 }
3228 case Intrinsic::vp_load_ff:
3230 default:
3231 llvm_unreachable("unexpected intrinsic");
3232 }
3233 }
3234
3235 /// Compute a cost of the given call instruction.
3236 ///
3237 /// Compute the cost of calling function F with return type RetTy and
3238 /// argument types Tys. F might be nullptr, in this case the cost of an
3239 /// arbitrary call with the specified signature will be returned.
3240 /// This is used, for instance, when we estimate call of a vector
3241 /// counterpart of the given function.
3242 /// \param F Called function, might be nullptr.
3243 /// \param RetTy Return value types.
3244 /// \param Tys Argument types.
/// \param CostKind Requested cost kind; not read by the body below.
3245 /// \returns The cost of Call instruction.
///
/// The body visible here returns the constant 10 without inspecting any
/// argument.
// NOTE(review): listing lines 3246-3247 (start of the signature) are absent
// from this extract.
3248 TTI::TargetCostKind CostKind) const override {
3249 return 10;
3250 }
3251
3252 unsigned getNumberOfParts(Type *Tp) const override {
3253 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3254 if (!LT.first.isValid())
3255 return 0;
3256 // Try to find actual number of parts for non-power-of-2 elements as
3257 // ceil(num-of-elements/num-of-subtype-elements).
3258 if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
3259 Tp && LT.second.isFixedLengthVector() &&
3260 !has_single_bit(FTp->getNumElements())) {
3261 if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
3262 EVT(LT.second).getTypeForEVT(Tp->getContext()));
3263 SubTp && SubTp->getElementType() == FTp->getElementType())
3264 return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
3265 }
3266 return LT.first.getValue();
3267 }
3268
3271 TTI::TargetCostKind) const override {
3272 return 0;
3273 }
3274
3275 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
3276 /// We're assuming that reduction operations are performed the following way:
3277 ///
3278 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
3279 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
3280 /// \----------------v-------------/ \----------v------------/
3281 /// n/2 elements n/2 elements
3282 /// %red1 = op <n x t> %val, <n x t> val1
3283 /// After this operation we have a vector %red1 where only the first n/2
3284 /// elements are meaningful, the second n/2 elements are undefined and can be
3285 /// dropped. All other operations are actually working with the vector of
3286 /// length n/2, not n, though the real vector length is still n.
3287 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
3288 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
3289 /// \----------------v-------------/ \----------v------------/
3290 /// n/4 elements 3*n/4 elements
3291 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
3292 /// length n/2, the resulting vector has length n/4 etc.
3293 ///
3294 /// The cost model should take into account that the actual length of the
3295 /// vector is reduced on each iteration.
///
/// The returned estimate is the sum of the accumulated shuffle costs, the
/// accumulated arithmetic costs and one ExtractElement at index 0.
// NOTE(review): listing lines 3296-3297 (the signature), 3300-3301, 3316 and
// 3318-3319 are absent from this extract; the statements they held are not
// described here.
3298 // Targets must implement a default value for the scalable case, since
3299 // we don't know how many lanes the vector has.
3302
3303 Type *ScalarTy = Ty->getElementType();
3304 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
3305 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
3306 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
3307 NumVecElts >= 2) {
3308 // Or reduction for i1 is represented as:
3309 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
3310 // %res = cmp ne iReduxWidth %val, 0
3311 // And reduction for i1 is represented as:
3312 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
3313 // %res = cmp eq iReduxWidth %val, 11111
3314 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
3315 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
3317 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
3320 }
// Total number of halving levels, derived from the element count.
3321 unsigned NumReduxLevels = Log2_32(NumVecElts);
3322 InstructionCost ArithCost = 0;
3323 InstructionCost ShuffleCost = 0;
3324 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
3325 unsigned LongVectorCount = 0;
// Element count of the legalized type; a non-vector legal type counts as 1.
3326 unsigned MVTLen =
3327 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
// While the vector is wider than the legalized type, halve it: each step is
// charged one extract-subvector shuffle and one arithmetic op on the
// half-width type.
3328 while (NumVecElts > MVTLen) {
3329 NumVecElts /= 2;
3330 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
3331 ShuffleCost += thisT()->getShuffleCost(
3332 TTI::SK_ExtractSubvector, SubTy, Ty, {}, CostKind, NumVecElts, SubTy);
3333 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
3334 Ty = SubTy;
3335 ++LongVectorCount;
3336 }
3337
// The levels handled by the splitting loop above are already paid for.
3338 NumReduxLevels -= LongVectorCount;
3339
3340 // The minimal length of the vector is limited by the real length of vector
3341 // operations performed on the current platform. That's why several final
3342 // reduction operations are performed on the vectors with the same
3343 // architecture-dependent length.
3344
3345 // By default reductions need one shuffle per reduction level.
3346 ShuffleCost +=
3347 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
3348 Ty, {}, CostKind, 0, Ty);
3349 ArithCost +=
3350 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
// Finally charge the extraction of element 0 from the remaining vector.
3351 return ShuffleCost + ArithCost +
3352 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
3353 CostKind, 0, nullptr, nullptr);
3354 }
3355
3356 /// Try to calculate the cost of performing strict (in-order) reductions,
3357 /// which involves doing a sequence of floating point additions in lane
3358 /// order, starting with an initial value. For example, consider a scalar
3359 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
3360 ///
3361 /// Vector = <float %v0, float %v1, float %v2, float %v3>
3362 ///
3363 /// %add1 = %InitVal + %v0
3364 /// %add2 = %add1 + %v1
3365 /// %add3 = %add2 + %v2
3366 /// %add4 = %add3 + %v3
3367 ///
3368 /// As a simple estimate we can say the cost of such a reduction is 4 times
3369 /// the cost of a scalar FP addition. We can only estimate the costs for
3370 /// fixed-width vectors here because for scalable vectors we do not know the
3371 /// runtime number of operations.
///
/// The value returned below is ExtractCost plus the scalar arithmetic cost
/// multiplied by the number of vector elements.
// NOTE(review): listing lines 3372-3373 (the signature), 3376-3377 and 3380
// (the start of the ExtractCost definition) are absent from this extract.
3374 // Targets must implement a default value for the scalable case, since
3375 // we don't know how many lanes the vector has.
3378
3379 auto *VTy = cast<FixedVectorType>(Ty);
3381 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
// Cost of one operation on the element type ...
3382 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
3383 Opcode, VTy->getElementType(), CostKind);
// ... charged once per lane.
3384 ArithCost *= VTy->getNumElements();
3385
3386 return ExtractCost + ArithCost;
3387 }
3388
// Cost of an arithmetic reduction over vector type Ty: asserts that Ty is
// non-null, then returns either the ordered (in-lane-order) estimate or the
// tree-wise estimate.
// NOTE(review): listing lines 3389-3390 (start of the signature) and 3394
// (the condition that selects the ordered path) are absent from this extract.
3391 std::optional<FastMathFlags> FMF,
3392 TTI::TargetCostKind CostKind) const override {
3393 assert(Ty && "Unknown reduction vector type");
3395 return getOrderedReductionCost(Opcode, Ty, CostKind);
3396 return getTreeReductionCost(Opcode, Ty, CostKind);
3397 }
3398
3399 /// Try to calculate op costs for min/max reduction operations.
3400 /// \param IID Intrinsic whose cost is charged for every reduction step.
/// \param Ty Vector type being reduced; cast to FixedVectorType below.
/// \param FMF Fast-math flags forwarded to the per-step intrinsic cost query.
/// \param CostKind Cost kind forwarded to every cost query.
///
/// The returned estimate is the sum of the accumulated shuffle costs, the
/// accumulated min/max intrinsic costs and one ExtractElement at index 0.
// NOTE(review): listing lines 3401-3402 (the signature) and 3406-3407 are
// absent from this extract.
3403 TTI::TargetCostKind CostKind) const override {
3404 // Targets must implement a default value for the scalable case, since
3405 // we don't know how many lanes the vector has.
3408
3409 Type *ScalarTy = Ty->getElementType();
3410 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
// Total number of halving levels, derived from the element count.
3411 unsigned NumReduxLevels = Log2_32(NumVecElts);
3412 InstructionCost MinMaxCost = 0;
3413 InstructionCost ShuffleCost = 0;
3414 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
3415 unsigned LongVectorCount = 0;
// Element count of the legalized type; a non-vector legal type counts as 1.
3416 unsigned MVTLen =
3417 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
// While the vector is wider than the legalized type, halve it: each step is
// charged one extract-subvector shuffle and one min/max intrinsic on the
// half-width type.
3418 while (NumVecElts > MVTLen) {
3419 NumVecElts /= 2;
3420 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
3421
3422 ShuffleCost += thisT()->getShuffleCost(
3423 TTI::SK_ExtractSubvector, SubTy, Ty, {}, CostKind, NumVecElts, SubTy);
3424
3425 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
3426 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
3427 Ty = SubTy;
3428 ++LongVectorCount;
3429 }
3430
// The levels handled by the splitting loop above are already paid for.
3431 NumReduxLevels -= LongVectorCount;
3432
3433 // The minimal length of the vector is limited by the real length of vector
3434 // operations performed on the current platform. That's why several final
3435 // reduction operations are performed on the vectors with the same
3436 // architecture-dependent length.
3437 ShuffleCost +=
3438 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
3439 Ty, {}, CostKind, 0, Ty);
3440 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
3441 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
3442 // The last min/max should be in vector registers and we counted it above.
3443 // So just need a single extractelement.
3444 return ShuffleCost + MinMaxCost +
3445 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
3446 CostKind, 0, nullptr, nullptr);
3447 }
3448
// Cost of reducing the extension of a vector: the elements of Ty are
// extended to ResTy (zero-extended when IsUnsigned, sign-extended otherwise)
// and then reduced with Opcode.
//
// An unsigned Add over a fixed-width vector of i1 is costed through a ctpop
// on an integer with one bit per lane; every other case is costed as an
// arithmetic reduction on the extended vector type plus the extension cast.
// NOTE(review): listing lines 3449 (return type), 3463 and 3473 (the tails
// of two getCastInstrCost calls) are absent from this extract.
3450 getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy,
3451 VectorType *Ty, std::optional<FastMathFlags> FMF,
3452 TTI::TargetCostKind CostKind) const override {
// FTy is null for anything that is not a fixed-width vector, hence the
// leading FTy check before it is dereferenced.
3453 if (auto *FTy = dyn_cast<FixedVectorType>(Ty);
3454 FTy && IsUnsigned && Opcode == Instruction::Add &&
3455 FTy->getElementType() == IntegerType::getInt1Ty(Ty->getContext())) {
3456 // Represent vector_reduce_add(ZExt(<n x i1>)) as
3457 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
3458 auto *IntTy =
3459 IntegerType::get(ResTy->getContext(), FTy->getNumElements());
// Missing fast-math flags are replaced by default-constructed ones.
3460 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy},
3461 FMF ? *FMF : FastMathFlags());
3462 return thisT()->getCastInstrCost(Instruction::BitCast, IntTy, FTy,
3464 thisT()->getIntrinsicInstrCost(ICA, CostKind);
3465 }
3466 // Without any native support, this is equivalent to the cost of
3467 // vecreduce.opcode(ext(Ty A)).
3468 VectorType *ExtTy = VectorType::get(ResTy, Ty);
3469 InstructionCost RedCost =
3470 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
3471 InstructionCost ExtCost = thisT()->getCastInstrCost(
3472 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
3474
3475 return RedCost + ExtCost;
3476 }
3477
// Cost of a multiply-accumulate reduction. RedOpcode must be Add or Sub
// (asserted below). The result is the reduction cost on the extended vector
// type, plus one multiply on that type, plus two extension casts (zero
// extension when IsUnsigned, sign extension otherwise).
// NOTE(review): listing lines 3478 (return type) and 3492 (the tail of the
// getCastInstrCost call) are absent from this extract.
3479 getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
3480 VectorType *Ty,
3481 TTI::TargetCostKind CostKind) const override {
3482 // Without any native support, this is equivalent to the cost of
3483 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
3484 // vecreduce.add(mul(A, B)).
3485 assert((RedOpcode == Instruction::Add || RedOpcode == Instruction::Sub) &&
3486 "The reduction opcode is expected to be Add or Sub.");
3487 VectorType *ExtTy = VectorType::get(ResTy, Ty);
// No fast-math flags are forwarded to the reduction cost query.
3488 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
3489 RedOpcode, ExtTy, std::nullopt, CostKind);
3490 InstructionCost ExtCost = thisT()->getCastInstrCost(
3491 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
3493
3494 InstructionCost MulCost =
3495 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
3496
// The extension cost is charged twice, once per multiplicand.
3497 return RedCost + MulCost + 2 * ExtCost;
3498 }
3499
// Cost of a partial reduction: inputs of element type InputTypeA (and
// InputTypeB when BinOp is set) at vectorization factor VF are accumulated
// into a vector of AccumType that has VF / Ratio lanes, where Ratio is the
// accumulator-to-input element size ratio.
// NOTE(review): listing lines 3499-3500 (return type and name), 3502, 3504
// (further parameters, including the declarations of VF and CostKind), 3511,
// 3519, 3521, 3534 and 3536 are absent from this extract; the guards and
// cast opcodes they held are not described here.
3501 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
3503 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
3505 std::optional<FastMathFlags> FMF) const override {
3506 unsigned EltSizeAcc = AccumType->getScalarSizeInBits();
3507 unsigned EltSizeInA = InputTypeA->getScalarSizeInBits();
// NOTE(review): integer division - Ratio is 0 whenever the accumulator
// element is narrower than the input element, which would make the
// `% Ratio` below a division by zero. Confirm callers never pass such types.
3508 unsigned Ratio = EltSizeAcc / EltSizeInA;
// The condition holds when VF is not a proper multiple of Ratio larger than
// Ratio, when the element sizes do not divide evenly, or when a binary op is
// present but its two input types differ.
3509 if (VF.getKnownMinValue() <= Ratio || VF.getKnownMinValue() % Ratio != 0 ||
3510 EltSizeAcc % EltSizeInA != 0 || (BinOp && InputTypeA != InputTypeB))
3512
3513 Type *InputVectorType = VectorType::get(InputTypeA, VF);
3514 Type *ExtInputVectorType = VectorType::get(AccumType, VF);
3515 Type *AccumVectorType =
3516 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
3517
3518 InstructionCost ExtendCostA = 0;
3520 ExtendCostA = getCastInstrCost(
3522 ExtInputVectorType, InputVectorType, TTI::CastContextHint::None,
3523 CostKind);
3524
3525 // TODO: add cost of extracting subvectors from the source vector that
3526 // is to be partially reduced.
// Ratio reduction ops on the narrower accumulator vector type.
3527 InstructionCost ReductionOpCost =
3528 Ratio * getArithmeticInstrCost(Opcode, AccumVectorType, CostKind);
3529
// Without a binary op only operand A's extension and the reduction count.
3530 if (!BinOp)
3531 return ExtendCostA + ReductionOpCost;
3532
3533 InstructionCost ExtendCostB = 0;
3535 ExtendCostB = getCastInstrCost(
3537 ExtInputVectorType, InputVectorType, TTI::CastContextHint::None,
3538 CostKind);
// With a binary op, add operand B's extension and the op itself on the
// extended input vector type.
3539 return ExtendCostA + ExtendCostB + ReductionOpCost +
3540 getArithmeticInstrCost(*BinOp, ExtInputVectorType, CostKind);
3541 }
3542
3544
3545 /// @}
3546};
3547
3548/// Concrete BasicTTIImpl that can be used if no further customization
3549/// is needed.
3550class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
3551 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
3552
3553 friend class BasicTTIImplBase<BasicTTIImpl>;
3554
3555 const TargetSubtargetInfo *ST;
3556 const TargetLoweringBase *TLI;
3557
3558 const TargetSubtargetInfo *getST() const { return ST; }
3559 const TargetLoweringBase *getTLI() const { return TLI; }
3560
3561public:
3562 LLVM_ABI explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
3563};
3564
3565} // end namespace llvm
3566
3567#endif // LLVM_CODEGEN_BASICTTIIMPL_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
This file implements the BitVector class.
#define LLVM_ABI
Definition Compiler.h:215
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
static Type * getValueType(Value *V, bool LookThroughCmp=false)
Returns the "element type" of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1353
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1208
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1137
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
an instruction to allocate memory on the stack
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:194
size_t size() const
Get the array size.
Definition ArrayRef.h:141
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:200
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getFPOpCost(Type *Ty) const override
bool preferToKeepConstantsAttached(const Instruction &Inst, const Function &Fn) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool shouldBuildLookupTables() const override
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override
bool isProfitableToHoist(Instruction *I) const override
unsigned getNumberOfParts(Type *Tp) const override
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
bool useAA() const override
unsigned getPrefetchDistance() const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getOperandsScalarizationOverhead(ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction's operands.
bool isLegalAddScalableImmediate(int64_t Imm) const override
unsigned getAssumedAddrSpace(const Value *V) const override
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const override
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty) const override
bool haveFastSqrt(Type *Ty) const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy, Align Alignment, unsigned AddrSpace) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned adjustInliningThreshold(const CallBase *CB) const override
unsigned getInliningThresholdMultiplier() const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset)
bool shouldBuildRelLookupTables() const override
bool isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorSplitCost() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
std::optional< unsigned > getMaxVScale() const override
unsigned getFlatAddressSpace() const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
Compute a cost of the given call instruction.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
~BasicTTIImplBase() override=default
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const override
unsigned getMaxPrefetchIterationsAhead() const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Get intrinsic cost based on argument types.
bool hasBranchDivergence(const Function *F=nullptr) const override
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
bool shouldPrefetchAddressSpace(unsigned AS) const override
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const override
unsigned getCacheLineSize() const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
bool shouldDropLSRSolutionIfLessProfitable() const override
int getInlinerVectorBonusPercent() const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
bool isLegalAddImmediate(int64_t imm) const override
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isSingleThreaded() const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isProfitableLSRChainElement(Instruction *I) const override
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override
bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const override
bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
TailFoldingStyle getPreferredTailFoldingStyle() const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
bool isLegalICmpImmediate(int64_t imm) const override
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
bool enableWritePrefetching() const override
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
bool isNumRegsMajorCostOfLSR() const override
LLVM_ABI BasicTTIImpl(const TargetMachine *TM, const Function &F)
size_type count() const
Returns the number of bits which are set.
Definition BitVector.h:181
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
static CmpInst::Predicate getGTPredicate(Intrinsic::ID ID)
static CmpInst::Predicate getLTPredicate(Intrinsic::ID ID)
This class represents a range of values.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
The core instruction combiner logic.
static InstructionCost getInvalid(CostType Val=0)
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Information for memory intrinsic cost model.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isSpliceMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isTransposeMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask is a transpose mask.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:685
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool preferSelectsOverBooleanArithmetic(EVT VT) const
Should we prefer selects to doing arithmetic on boolean types.
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT, Align Alignment, unsigned AddrSpace) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
bool isSuitableForBitTests(const DenseMap< const BasicBlock *, unsigned int > &DestCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeAction getLoadAction(EVT ValVT, EVT MemVT, Align Alignment, unsigned AddrSpace, unsigned ExtType, bool Atomic) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
bool isLoadLegal(EVT ValVT, EVT MemVT, Align Alignment, unsigned AddrSpace, unsigned ExtType, bool Atomic) const
Return true if the specified load with extension is legal on this target.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
bool isPositionIndependent() const
const Triple & getTargetTriple() const
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
CodeModel::Model getCodeModel() const
Returns the code model.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool isProfitableLSRChainElement(Instruction *I) const
virtual TailFoldingStyle getPreferredTailFoldingStyle() const
virtual const DataLayout & getDataLayout() const
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
virtual std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
virtual bool shouldDropLSRSolutionIfLessProfitable() const
virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
virtual std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
virtual bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const
virtual std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
virtual unsigned getEpilogueVectorizationMinVF() const
virtual InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
virtual bool isLoweredToCall(const Function *F) const
virtual InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
virtual InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
virtual InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
virtual InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, const Instruction *I) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
static LLVM_ABI Instruction::CastOps getOpcodeForPartialReductionExtendKind(PartialReductionExtendKind Kind)
Get the cast opcode for an extension kind.
MemIndexedMode
The type of load/store indexing.
static LLVM_ABI VectorInstrContext getVectorInstrContextHint(const Instruction *I)
Calculates a VectorInstrContext from I.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
LLVM_ABI bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition Triple.cpp:2061
bool isAArch64() const
Tests whether the target is AArch64 (little and big endian).
Definition Triple.h:1009
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:397
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
static LLVM_ABI bool isVPCast(Intrinsic::ID ID)
static LLVM_ABI bool isVPCmp(Intrinsic::ID ID)
static LLVM_ABI std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
static LLVM_ABI std::optional< Intrinsic::ID > getFunctionalIntrinsicIDForVP(Intrinsic::ID ID)
static LLVM_ABI bool isVPIntrinsic(Intrinsic::ID)
static LLVM_ABI bool isVPReduction(Intrinsic::ID ID)
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
Base class of all SIMD vector types.
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition APInt.cpp:3040
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:787
@ SMULFIX
RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on 2 integers with the same...
Definition ISDOpcodes.h:394
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
@ FATAN2
FATAN2 - atan2, inspired by libm.
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ CLMUL
Carry-less multiplication operations.
Definition ISDOpcodes.h:778
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ MASKED_UDIV
Masked vector arithmetic that returns poison on disabled lanes.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:813
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:735
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:953
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
LLVM_ABI bool isTargetIntrinsic(ID IID)
isTargetIntrinsic - Returns true if IID is an intrinsic specific to a certain target.
LLVM_ABI Libcall getSINCOSPI(EVT RetVT)
getSINCOSPI - Return the SINCOSPI_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getMODF(EVT VT)
getMODF - Return the MODF_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getSINCOS(EVT RetVT)
getSINCOS - Return the SINCOS_* value for the given types, or UNKNOWN_LIBCALL if there is none.
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
Type * toScalarizedTy(Type *Ty)
A helper for converting vectorized types to scalarized (non-vector) types.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
bool isVectorizedTy(Type *Ty)
Returns true if Ty is a vector type or a struct of vector types where all vector types share the same...
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
ElementCount getVectorizedTypeVF(Type *Ty)
Returns the number of vector elements for a vectorized type.
LLVM_ABI ConstantRange getVScaleRange(const Function *F, unsigned BitWidth)
Determine the possible constant range of vscale with the given bit width, based on the vscale_range f...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
LLVM_ABI cl::opt< unsigned > PartialUnrollingThreshold
LLVM_ABI bool isVectorizedStructTy(StructType *StructTy)
Returns true if StructTy is an unpacked literal struct where all elements are vectors of matching ele...
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
ElementCount getVectorElementCount() const
Definition ValueTypes.h:373
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Attributes of a target dependent hardware loop.
static LLVM_ABI bool hasVectorMaskArgument(RTLIB::LibcallImpl Impl)
Returns true if the function has a vector mask argument, which is assumed to be the last argument.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling based on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).