1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
34#include "llvm/IR/BasicBlock.h"
35#include "llvm/IR/Constant.h"
36#include "llvm/IR/Constants.h"
37#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/InstrTypes.h"
40#include "llvm/IR/Instruction.h"
42#include "llvm/IR/Intrinsics.h"
43#include "llvm/IR/Operator.h"
44#include "llvm/IR/Type.h"
45#include "llvm/IR/Value.h"
53#include <algorithm>
54#include <cassert>
55#include <cstdint>
56#include <limits>
57#include <optional>
58#include <utility>
59
60namespace llvm {
61
62class Function;
63class GlobalValue;
64class LLVMContext;
65class ScalarEvolution;
66class SCEV;
67class TargetMachine;
68
69extern cl::opt<unsigned> PartialUnrollingThreshold;
70
71/// Base class which can be used to help build a TTI implementation.
72///
73/// This class provides as much implementation of the TTI interface as is
74/// possible using the target independent parts of the code generator.
75///
76/// In order to subclass it, your class must implement a getST() method to
77/// return the subtarget, and a getTLI() method to return the target lowering.
78/// We need these methods implemented in the derived class so that this class
79/// doesn't have to duplicate storage for them.
80template <typename T>
81class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
82private:
83 using BaseT = TargetTransformInfoImplCRTPBase<T>;
84 using TTI = TargetTransformInfo;
85
86 /// Helper function to access this as a T.
87 T *thisT() { return static_cast<T *>(this); }
88
89 /// Estimate a cost of Broadcast as an extract and sequence of insert
90 /// operations.
91 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
94 // Broadcast cost is equal to the cost of extracting the zero'th element
95 // plus the cost of inserting it into every element of the result vector.
96 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
97 CostKind, 0, nullptr, nullptr);
98
99 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
100 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
101 CostKind, i, nullptr, nullptr);
102 }
103 return Cost;
104 }
105
106 /// Estimate a cost of shuffle as a sequence of extract and insert
107 /// operations.
108 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
111 // Shuffle cost is equal to the cost of extracting every element from the
112 // source vectors plus the cost of inserting them into the result vector.
113
114 // e.g. a <4 x float> shuffle with mask <0,5,2,7> extracts index 0 of the
115 // first vector, index 1 of the second vector, index 2 of the first vector
116 // and index 3 of the second vector, and inserts them at indices <0,1,2,3>
117 // of the result vector.
118 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
119 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
120 CostKind, i, nullptr, nullptr);
121 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
122 CostKind, i, nullptr, nullptr);
123 }
124 return Cost;
125 }
126
127 /// Estimate a cost of subvector extraction as a sequence of extract and
128 /// insert operations.
129 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
131 int Index,
132 FixedVectorType *SubVTy) {
133 assert(VTy && SubVTy &&
134 "Can only extract subvectors from vectors");
135 int NumSubElts = SubVTy->getNumElements();
136 assert((!isa<FixedVectorType>(VTy) ||
137 (Index + NumSubElts) <=
138 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
139 "SK_ExtractSubvector index out of range");
140
142 // Subvector extraction cost is equal to the cost of extracting each element
143 // from the source vector plus the cost of inserting it into the resulting
144 // subvector.
145 for (int i = 0; i != NumSubElts; ++i) {
146 Cost +=
147 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
148 CostKind, i + Index, nullptr, nullptr);
149 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
150 CostKind, i, nullptr, nullptr);
151 }
152 return Cost;
153 }
154
155 /// Estimate a cost of subvector insertion as a sequence of extract and
156 /// insert operations.
157 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
159 int Index,
160 FixedVectorType *SubVTy) {
161 assert(VTy && SubVTy &&
162 "Can only insert subvectors into vectors");
163 int NumSubElts = SubVTy->getNumElements();
164 assert((!isa<FixedVectorType>(VTy) ||
165 (Index + NumSubElts) <=
166 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
167 "SK_InsertSubvector index out of range");
168
170 // Subvector insertion cost is equal to the cost of extracting each element
171 // from the subvector plus the cost of inserting it into the resulting
172 // (wider) vector.
173 for (int i = 0; i != NumSubElts; ++i) {
174 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
175 CostKind, i, nullptr, nullptr);
176 Cost +=
177 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
178 i + Index, nullptr, nullptr);
179 }
180 return Cost;
181 }
182
183 /// Local query method delegates up to T which *must* implement this!
184 const TargetSubtargetInfo *getST() const {
185 return static_cast<const T *>(this)->getST();
186 }
187
188 /// Local query method delegates up to T which *must* implement this!
189 const TargetLoweringBase *getTLI() const {
190 return static_cast<const T *>(this)->getTLI();
191 }
192
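  /// Map a TTI::MemIndexedMode onto the corresponding ISD::MemIndexedMode.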
193 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
194 switch (M) {
196 return ISD::UNINDEXED;
197 case TTI::MIM_PreInc:
198 return ISD::PRE_INC;
199 case TTI::MIM_PreDec:
200 return ISD::PRE_DEC;
201 case TTI::MIM_PostInc:
202 return ISD::POST_INC;
203 case TTI::MIM_PostDec:
204 return ISD::POST_DEC;
205 }
206 llvm_unreachable("Unexpected MemIndexedMode");
207 }
208
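  /// Conservative scalarization estimate shared by the masked load/store and
  /// gather/scatter cost hooks: per-element address extraction (for
  /// gather/scatter), scalar loads/stores, packing or unpacking the value
  /// vector, and branch/PHI overhead when the mask is not a compile-time
  /// constant.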
209 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
210 Align Alignment,
211 bool VariableMask,
212 bool IsGatherScatter,
214 unsigned AddressSpace = 0) {
215 // We cannot scalarize scalable vectors, so return Invalid.
216 if (isa<ScalableVectorType>(DataTy))
217 return InstructionCost::getInvalid();
218
219 auto *VT = cast<FixedVectorType>(DataTy);
220 unsigned VF = VT->getNumElements();
221
222 // Assume the target does not have support for gather/scatter operations
223 // and provide a rough estimate.
224 //
225 // First, compute the cost of the individual memory operations.
226 InstructionCost AddrExtractCost =
227 IsGatherScatter
230 PointerType::get(VT->getElementType(), 0), VF),
231 /*Insert=*/false, /*Extract=*/true, CostKind)
232 : 0;
233
234 // The cost of the scalar loads/stores.
235 InstructionCost MemoryOpCost =
236 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
238
239 // Next, compute the cost of packing the result in a vector.
240 InstructionCost PackingCost =
241 getScalarizationOverhead(VT, Opcode != Instruction::Store,
242 Opcode == Instruction::Store, CostKind);
243
244 InstructionCost ConditionalCost = 0;
245 if (VariableMask) {
246 // Compute the cost of conditionally executing the memory operations with
247 // variable masks. This includes extracting the individual conditions, the
248 // branches, and the PHIs needed to combine the results.
249 // NOTE: Estimating the cost of conditionally executing the memory
250 // operations accurately is quite difficult and the current solution
251 // provides a very rough estimate only.
252 ConditionalCost =
255 /*Insert=*/false, /*Extract=*/true, CostKind) +
256 VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
257 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
258 }
259
260 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
261 }
262
263protected:
265 : BaseT(DL) {}
266 virtual ~BasicTTIImplBase() = default;
267
269
270public:
271 /// \name Scalar TTI Implementations
272 /// @{
274 unsigned AddressSpace, Align Alignment,
275 unsigned *Fast) const {
276 EVT E = EVT::getIntegerVT(Context, BitWidth);
277 return getTLI()->allowsMisalignedMemoryAccesses(
279 }
280
281 bool hasBranchDivergence(const Function *F = nullptr) { return false; }
282
283 bool isSourceOfDivergence(const Value *V) { return false; }
284
285 bool isAlwaysUniform(const Value *V) { return false; }
286
287 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
288 return false;
289 }
290
291 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
292 return true;
293 }
294
296 // Return an invalid address space.
297 return -1;
298 }
299
301 Intrinsic::ID IID) const {
302 return false;
303 }
304
305 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
306 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
307 }
308
309 unsigned getAssumedAddrSpace(const Value *V) const {
310 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
311 }
312
313 bool isSingleThreaded() const {
314 return getTLI()->getTargetMachine().Options.ThreadModel ==
316 }
317
318 std::pair<const Value *, unsigned>
320 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
321 }
322
324 Value *NewV) const {
325 return nullptr;
326 }
327
328 bool isLegalAddImmediate(int64_t imm) {
329 return getTLI()->isLegalAddImmediate(imm);
330 }
331
332 bool isLegalAddScalableImmediate(int64_t Imm) {
333 return getTLI()->isLegalAddScalableImmediate(Imm);
334 }
335
336 bool isLegalICmpImmediate(int64_t imm) {
337 return getTLI()->isLegalICmpImmediate(imm);
338 }
339
340 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
341 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
342 Instruction *I = nullptr,
343 int64_t ScalableOffset = 0) {
345 AM.BaseGV = BaseGV;
346 AM.BaseOffs = BaseOffset;
347 AM.HasBaseReg = HasBaseReg;
348 AM.Scale = Scale;
349 AM.ScalableOffset = ScalableOffset;
350 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
351 }
352
353 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
354 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
355 }
356
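  /// Repeatedly halve VF while the target directly supports a plain or
  /// truncating store of VF/2 elements of ScalarMemTy, and return the
  /// resulting minimum VF.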
357 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
358 Type *ScalarValTy) const {
359 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
360 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
361 EVT VT = getTLI()->getValueType(DL, SrcTy);
362 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
363 getTLI()->isOperationCustom(ISD::STORE, VT))
364 return true;
365
366 EVT ValVT =
367 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
368 EVT LegalizedVT =
369 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
370 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
371 };
372 while (VF > 2 && IsSupportedByTarget(VF))
373 VF /= 2;
374 return VF;
375 }
376
378 const DataLayout &DL) const {
379 EVT VT = getTLI()->getValueType(DL, Ty);
380 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
381 }
382
384 const DataLayout &DL) const {
385 EVT VT = getTLI()->getValueType(DL, Ty);
386 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
387 }
388
391 }
392
395 }
396
400 }
401
404 }
405
408 }
409
411 StackOffset BaseOffset, bool HasBaseReg,
412 int64_t Scale, unsigned AddrSpace) {
414 AM.BaseGV = BaseGV;
415 AM.BaseOffs = BaseOffset.getFixed();
416 AM.HasBaseReg = HasBaseReg;
417 AM.Scale = Scale;
418 AM.ScalableOffset = BaseOffset.getScalable();
419 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
420 return 0;
421 return -1;
422 }
423
424 bool isTruncateFree(Type *Ty1, Type *Ty2) {
425 return getTLI()->isTruncateFree(Ty1, Ty2);
426 }
427
429 return getTLI()->isProfitableToHoist(I);
430 }
431
432 bool useAA() const { return getST()->useAA(); }
433
434 bool isTypeLegal(Type *Ty) {
435 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
436 return getTLI()->isTypeLegal(VT);
437 }
438
439 unsigned getRegUsageForType(Type *Ty) {
440 EVT ETy = getTLI()->getValueType(DL, Ty);
441 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
442 }
443
447 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
448 }
449
451 unsigned &JumpTableSize,
453 BlockFrequencyInfo *BFI) {
454 /// Try to find the estimated number of clusters. Note that the number of
455 /// clusters identified in this function could be different from the actual
456 /// numbers found in lowering. This function ignores switches that are
457 /// lowered with a mix of jump table / bit test / BTree. It was initially
458 /// intended for estimating the cost of a switch in the inline cost
459 /// heuristic, but it is a generic cost model that can be used in other
460 /// places as well (e.g., in loop unrolling).
461 unsigned N = SI.getNumCases();
462 const TargetLoweringBase *TLI = getTLI();
463 const DataLayout &DL = this->getDataLayout();
464
465 JumpTableSize = 0;
466 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
467
468 // Early exit if neither a jump table nor a bit test is possible.
469 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
470 return N;
471
472 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
473 APInt MinCaseVal = MaxCaseVal;
474 for (auto CI : SI.cases()) {
475 const APInt &CaseVal = CI.getCaseValue()->getValue();
476 if (CaseVal.sgt(MaxCaseVal))
477 MaxCaseVal = CaseVal;
478 if (CaseVal.slt(MinCaseVal))
479 MinCaseVal = CaseVal;
480 }
481
482 // Check if suitable for a bit test
483 if (N <= DL.getIndexSizeInBits(0u)) {
485 for (auto I : SI.cases())
486 Dests.insert(I.getCaseSuccessor());
487
488 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
489 DL))
490 return 1;
491 }
492
493 // Check if suitable for a jump table.
494 if (IsJTAllowed) {
495 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
496 return N;
498 (MaxCaseVal - MinCaseVal)
499 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
500 // Check whether a range of clusters is dense enough for a jump table
501 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
502 JumpTableSize = Range;
503 return 1;
504 }
505 }
506 return N;
507 }
508
510 const TargetLoweringBase *TLI = getTLI();
511 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
512 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
513 }
514
516 const TargetMachine &TM = getTLI()->getTargetMachine();
517 // If non-PIC mode, do not generate a relative lookup table.
518 if (!TM.isPositionIndependent())
519 return false;
520
521 /// Relative lookup table entries consist of 32-bit offsets.
522 /// Do not generate relative lookup tables for large code models
523 /// in 64-bit architectures where 32-bit offsets might not be enough.
524 if (TM.getCodeModel() == CodeModel::Medium ||
525 TM.getCodeModel() == CodeModel::Large)
526 return false;
527
528 const Triple &TargetTriple = TM.getTargetTriple();
529 if (!TargetTriple.isArch64Bit())
530 return false;
531
532 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
533 // there.
534 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
535 return false;
536
537 return true;
538 }
539
540 bool haveFastSqrt(Type *Ty) {
541 const TargetLoweringBase *TLI = getTLI();
542 EVT VT = TLI->getValueType(DL, Ty);
543 return TLI->isTypeLegal(VT) &&
545 }
546
548 return true;
549 }
550
552 // Check whether FADD is available, as a proxy for floating-point in
553 // general.
554 const TargetLoweringBase *TLI = getTLI();
555 EVT VT = TLI->getValueType(DL, Ty);
559 }
560
562 const Function &Fn) const {
563 switch (Inst.getOpcode()) {
564 default:
565 break;
566 case Instruction::SDiv:
567 case Instruction::SRem:
568 case Instruction::UDiv:
569 case Instruction::URem: {
570 if (!isa<ConstantInt>(Inst.getOperand(1)))
571 return false;
572 EVT VT = getTLI()->getValueType(DL, Inst.getType());
573 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
574 }
575 };
576
577 return false;
578 }
579
580 unsigned getInliningThresholdMultiplier() const { return 1; }
581 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
582 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
583 return 0;
584 }
585
586 int getInlinerVectorBonusPercent() const { return 150; }
587
591 // This unrolling functionality is target independent, but to provide some
592 // motivation for its intended use, for x86:
593
594 // According to the Intel 64 and IA-32 Architectures Optimization Reference
595 // Manual, Intel Core models and later have a loop stream detector (and
596 // associated uop queue) that can benefit from partial unrolling.
597 // The relevant requirements are:
598 // - The loop must have no more than 4 (8 for Nehalem and later) branches
599 // taken, and none of them may be calls.
600 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
601
602 // According to the Software Optimization Guide for AMD Family 15h
603 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
604 // and loop buffer which can benefit from partial unrolling.
605 // The relevant requirements are:
606 // - The loop must have fewer than 16 branches
607 // - The loop must have less than 40 uops in all executed loop branches
608
609 // The number of taken branches in a loop is hard to estimate here, and
610 // benchmarking has revealed that it is better not to be conservative when
611 // estimating the branch count. As a result, we'll ignore the branch limits
612 // until someone finds a case where it matters in practice.
613
614 unsigned MaxOps;
615 const TargetSubtargetInfo *ST = getST();
616 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
618 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
619 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
620 else
621 return;
622
623 // Scan the loop: don't unroll loops with calls.
624 for (BasicBlock *BB : L->blocks()) {
625 for (Instruction &I : *BB) {
626 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
627 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
628 if (!thisT()->isLoweredToCall(F))
629 continue;
630 }
631
632 if (ORE) {
633 ORE->emit([&]() {
634 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
635 L->getHeader())
636 << "advising against unrolling the loop because it "
637 "contains a "
638 << ore::NV("Call", &I);
639 });
640 }
641 return;
642 }
643 }
644 }
645
646 // Enable runtime and partial unrolling up to the specified size.
647 // Enable using trip count upper bound to unroll loops.
648 UP.Partial = UP.Runtime = UP.UpperBound = true;
649 UP.PartialThreshold = MaxOps;
650
651 // Avoid unrolling when optimizing for size.
652 UP.OptSizeThreshold = 0;
654
655 // Set the number of instructions optimized away when the "back edge"
656 // becomes a "fall through" to its default value of 2.
657 UP.BEInsns = 2;
658 }
659
662 PP.PeelCount = 0;
663 PP.AllowPeeling = true;
664 PP.AllowLoopNestsPeeling = false;
665 PP.PeelProfiledIterations = true;
666 }
667
669 AssumptionCache &AC,
670 TargetLibraryInfo *LibInfo,
671 HardwareLoopInfo &HWLoopInfo) {
672 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
673 }
674
677 }
678
680 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
681 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
682 }
683
684 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
685 IntrinsicInst &II) {
687 }
688
689 std::optional<Value *>
691 APInt DemandedMask, KnownBits &Known,
692 bool &KnownBitsComputed) {
693 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
694 KnownBitsComputed);
695 }
696
698 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
699 APInt &UndefElts2, APInt &UndefElts3,
700 std::function<void(Instruction *, unsigned, APInt, APInt &)>
701 SimplifyAndSetOp) {
703 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
704 SimplifyAndSetOp);
705 }
706
707 virtual std::optional<unsigned>
709 return std::optional<unsigned>(
710 getST()->getCacheSize(static_cast<unsigned>(Level)));
711 }
712
713 virtual std::optional<unsigned>
715 std::optional<unsigned> TargetResult =
716 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
717
718 if (TargetResult)
719 return TargetResult;
720
721 return BaseT::getCacheAssociativity(Level);
722 }
723
724 virtual unsigned getCacheLineSize() const {
725 return getST()->getCacheLineSize();
726 }
727
728 virtual unsigned getPrefetchDistance() const {
729 return getST()->getPrefetchDistance();
730 }
731
732 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
733 unsigned NumStridedMemAccesses,
734 unsigned NumPrefetches,
735 bool HasCall) const {
736 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
737 NumPrefetches, HasCall);
738 }
739
740 virtual unsigned getMaxPrefetchIterationsAhead() const {
741 return getST()->getMaxPrefetchIterationsAhead();
742 }
743
744 virtual bool enableWritePrefetching() const {
745 return getST()->enableWritePrefetching();
746 }
747
748 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
749 return getST()->shouldPrefetchAddressSpace(AS);
750 }
751
752 /// @}
753
754 /// \name Vector TTI Implementations
755 /// @{
756
758 return TypeSize::getFixed(32);
759 }
760
761 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
762 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
763 bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
764
765 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
766 /// are set if the demanded result elements need to be inserted and/or
767 /// extracted from vectors.
769 const APInt &DemandedElts,
770 bool Insert, bool Extract,
772 /// FIXME: a bitfield is not a reasonable abstraction for talking about
773 /// which elements are needed from a scalable vector
774 if (isa<ScalableVectorType>(InTy))
775 return InstructionCost::getInvalid();
776 auto *Ty = cast<FixedVectorType>(InTy);
777
778 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
779 "Vector size mismatch");
780
782
783 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
784 if (!DemandedElts[i])
785 continue;
786 if (Insert)
787 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
788 CostKind, i, nullptr, nullptr);
789 if (Extract)
790 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
791 CostKind, i, nullptr, nullptr);
792 }
793
794 return Cost;
795 }
796
797 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
799 bool Extract,
801 if (isa<ScalableVectorType>(InTy))
802 return InstructionCost::getInvalid();
803 auto *Ty = cast<FixedVectorType>(InTy);
804
805 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
806 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
807 CostKind);
808 }
809
810 /// Estimate the overhead of scalarizing an instruction's unique
811 /// non-constant operands. The (potentially vector) types to use for each
812 /// argument are passed via Tys.
817 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
818
820 SmallPtrSet<const Value*, 4> UniqueOperands;
821 for (int I = 0, E = Args.size(); I != E; I++) {
822 // Disregard things like metadata arguments.
823 const Value *A = Args[I];
824 Type *Ty = Tys[I];
825 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
826 !Ty->isPtrOrPtrVectorTy())
827 continue;
828
829 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
830 if (auto *VecTy = dyn_cast<VectorType>(Ty))
831 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
832 /*Extract*/ true, CostKind);
833 }
834 }
835
836 return Cost;
837 }
838
839 /// Estimate the overhead of scalarizing the inputs and outputs of an
840 /// instruction, with return type RetTy and arguments Args of type Tys. If
841 /// Args are unknown (empty), then the cost associated with one argument is
842 /// added as a heuristic.
848 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
849 if (!Args.empty())
851 else
852 // When no information on arguments is provided, we add the cost
853 // associated with one argument as a heuristic.
854 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
855 /*Extract*/ true, CostKind);
856
857 return Cost;
858 }
859
860 /// Estimate the cost of type-legalization and the legalized type.
861 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
862 LLVMContext &C = Ty->getContext();
863 EVT MTy = getTLI()->getValueType(DL, Ty);
864
866 // We keep legalizing the type until we find a legal kind. We assume that
867 // the only operation that costs anything is the split. After splitting
868 // we need to handle two types.
869 while (true) {
871
873 // Ensure we return a sensible simple VT here, since many callers of
874 // this function require it.
875 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
876 return std::make_pair(InstructionCost::getInvalid(), VT);
877 }
878
879 if (LK.first == TargetLoweringBase::TypeLegal)
880 return std::make_pair(Cost, MTy.getSimpleVT());
881
882 if (LK.first == TargetLoweringBase::TypeSplitVector ||
884 Cost *= 2;
885
886 // Do not loop with f128 type.
887 if (MTy == LK.second)
888 return std::make_pair(Cost, MTy.getSimpleVT());
889
890 // Keep legalizing the type.
891 MTy = LK.second;
892 }
893 }
894
895 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
896
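  /// Estimate the cost of an arithmetic instruction from the legalized type:
  /// legal or promoted operations cost one unit per legalized value (two for
  /// floating point), custom lowering doubles that, an expanded URem/SRem is
  /// costed as div + mul + sub, and anything else falls back to scalarization.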
897 InstructionCost getArithmeticInstrCost(
898 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
901 ArrayRef<const Value *> Args = std::nullopt,
902 const Instruction *CxtI = nullptr) {
903 // Check if any of the operands are vector operands.
904 const TargetLoweringBase *TLI = getTLI();
905 int ISD = TLI->InstructionOpcodeToISD(Opcode);
906 assert(ISD && "Invalid opcode");
907
908 // TODO: Handle more cost kinds.
910 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
911 Opd1Info, Opd2Info,
912 Args, CxtI);
913
914 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
915
916 bool IsFloat = Ty->isFPOrFPVectorTy();
917 // Assume that floating point arithmetic operations cost twice as much as
918 // integer operations.
919 InstructionCost OpCost = (IsFloat ? 2 : 1);
920
921 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
922 // The operation is legal. Assume it costs 1.
923 // TODO: Once we have extract/insert subvector cost we need to use them.
924 return LT.first * OpCost;
925 }
926
927 if (!TLI->isOperationExpand(ISD, LT.second)) {
928 // If the operation is custom lowered, then assume that the code is twice
929 // as expensive.
930 return LT.first * 2 * OpCost;
931 }
932
933 // An 'Expand' of URem and SRem is special because it may default
934 // to expanding the operation into a sequence of sub-operations
935 // i.e. X % Y -> X-(X/Y)*Y.
936 if (ISD == ISD::UREM || ISD == ISD::SREM) {
937 bool IsSigned = ISD == ISD::SREM;
938 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
939 LT.second) ||
940 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
941 LT.second)) {
942 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
943 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
944 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
945 InstructionCost MulCost =
946 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
947 InstructionCost SubCost =
948 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
949 return DivCost + MulCost + SubCost;
950 }
951 }
952
953 // We cannot scalarize scalable vectors, so return Invalid.
954 if (isa<ScalableVectorType>(Ty))
955 return InstructionCost::getInvalid();
956
957 // Else, assume that we need to scalarize this op.
958 // TODO: If one of the types gets legalized by splitting, handle this
959 // similarly to what getCastInstrCost() does.
960 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
961 InstructionCost Cost = thisT()->getArithmeticInstrCost(
962 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
963 Args, CxtI);
964 // Return the cost of multiple scalar invocations plus the cost of
965 // inserting and extracting the values.
966 SmallVector<Type *> Tys(Args.size(), Ty);
967 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
968 VTy->getNumElements() * Cost;
969 }
970
971 // We don't know anything about this scalar instruction.
972 return OpCost;
973 }
974
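  /// Use the constant Mask, when available, to refine a generic shuffle kind
  /// into a more specific one (broadcast, reverse, select, transpose, splice,
  /// or subvector extract/insert); returns Kind unchanged if nothing better
  /// matches.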
975 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
976 ArrayRef<int> Mask,
977 VectorType *Ty, int &Index,
978 VectorType *&SubTy) const {
979 if (Mask.empty())
980 return Kind;
981 int NumSrcElts = Ty->getElementCount().getKnownMinValue();
982 switch (Kind) {
984 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
985 return TTI::SK_Reverse;
986 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
987 return TTI::SK_Broadcast;
988 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
989 (Index + Mask.size()) <= (size_t)NumSrcElts) {
990 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
992 }
993 break;
995 int NumSubElts;
996 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
997 Mask, NumSrcElts, NumSubElts, Index)) {
998 if (Index + NumSubElts > NumSrcElts)
999 return Kind;
1000 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
1002 }
1003 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
1004 return TTI::SK_Select;
1005 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1006 return TTI::SK_Transpose;
1007 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1008 return TTI::SK_Splice;
1009 break;
1010 }
1011 case TTI::SK_Select:
1012 case TTI::SK_Reverse:
1013 case TTI::SK_Broadcast:
1014 case TTI::SK_Transpose:
1017 case TTI::SK_Splice:
1018 break;
1019 }
1020 return Kind;
1021 }
1022
1024 ArrayRef<int> Mask,
1026 VectorType *SubTp,
1027 ArrayRef<const Value *> Args = std::nullopt,
1028 const Instruction *CxtI = nullptr) {
1029 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
1030 case TTI::SK_Broadcast:
1031 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1032 return getBroadcastShuffleOverhead(FVT, CostKind);
1034 case TTI::SK_Select:
1035 case TTI::SK_Splice:
1036 case TTI::SK_Reverse:
1037 case TTI::SK_Transpose:
1040 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1041 return getPermuteShuffleOverhead(FVT, CostKind);
1044 return getExtractSubvectorOverhead(Tp, CostKind, Index,
1045 cast<FixedVectorType>(SubTp));
1047 return getInsertSubvectorOverhead(Tp, CostKind, Index,
1048 cast<FixedVectorType>(SubTp));
1049 }
1050 llvm_unreachable("Unknown TTI::ShuffleKind");
1051 }
1052
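  /// Estimate the cost of a cast: free conversions (no-op truncations and
  /// bitcasts, free extensions, extending loads) cost 0, legal or promoted
  /// casts cost one unit per legalized value, split vectors recurse on the
  /// half-width types, and everything else is scalarized.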
1053 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1056 const Instruction *I = nullptr) {
1057 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1058 return 0;
1059
1060 const TargetLoweringBase *TLI = getTLI();
1061 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1062 assert(ISD && "Invalid opcode");
1063 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1064 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1065
1066 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1067 TypeSize DstSize = DstLT.second.getSizeInBits();
1068 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1069 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1070
1071 switch (Opcode) {
1072 default:
1073 break;
1074 case Instruction::Trunc:
1075 // Check for NOOP conversions.
1076 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1077 return 0;
1078 [[fallthrough]];
1079 case Instruction::BitCast:
1080 // Bitcasts between types that are legalized to the same type are free;
1081 // assume an int-to-ptr or ptr-to-int cast of the same size is also free.
1082 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1083 SrcSize == DstSize)
1084 return 0;
1085 break;
1086 case Instruction::FPExt:
1087 if (I && getTLI()->isExtFree(I))
1088 return 0;
1089 break;
1090 case Instruction::ZExt:
1091 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1092 return 0;
1093 [[fallthrough]];
1094 case Instruction::SExt:
1095 if (I && getTLI()->isExtFree(I))
1096 return 0;
1097
1098 // If this is a zext/sext of a load, return 0 if the corresponding
1099 // extending load exists on the target and the result type is legal.
1100 if (CCH == TTI::CastContextHint::Normal) {
1101 EVT ExtVT = EVT::getEVT(Dst);
1102 EVT LoadVT = EVT::getEVT(Src);
1103 unsigned LType =
1104 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1105 if (DstLT.first == SrcLT.first &&
1106 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1107 return 0;
1108 }
1109 break;
1110 case Instruction::AddrSpaceCast:
1111 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1112 Dst->getPointerAddressSpace()))
1113 return 0;
1114 break;
1115 }
1116
1117 auto *SrcVTy = dyn_cast<VectorType>(Src);
1118 auto *DstVTy = dyn_cast<VectorType>(Dst);
1119
1120 // If the cast is marked as legal (or promote) then assume low cost.
1121 if (SrcLT.first == DstLT.first &&
1122 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1123 return SrcLT.first;
1124
1125 // Handle scalar conversions.
1126 if (!SrcVTy && !DstVTy) {
1127 // Just check the op cost. If the operation is legal then assume it
1128 // costs 1.
1129 if (!TLI->isOperationExpand(ISD, DstLT.second))
1130 return 1;
1131
1132 // Assume that illegal scalar instructions are expensive.
1133 return 4;
1134 }
1135
1136 // Check vector-to-vector casts.
1137 if (DstVTy && SrcVTy) {
1138 // If the cast is between same-sized registers, then the check is simple.
1139 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1140
1141 // Assume that Zext is done using AND.
1142 if (Opcode == Instruction::ZExt)
1143 return SrcLT.first;
1144
1145 // Assume that sext is done using SHL and SRA.
1146 if (Opcode == Instruction::SExt)
1147 return SrcLT.first * 2;
1148
1149 // Just check the op cost. If the operation is legal then assume it
1150 // costs 1 and multiply by the type-legalization overhead.
1152 if (!TLI->isOperationExpand(ISD, DstLT.second))
1153 return SrcLT.first * 1;
1154 }
1155
1156 // If we are legalizing by splitting, query the concrete TTI for the cost
1157 // of casting the original vector twice. We also need to factor in the
1158 // cost of the split itself. Count that as 1, to be consistent with
1159 // getTypeLegalizationCost().
1160 bool SplitSrc =
1161 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1163 bool SplitDst =
1164 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1166 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1167 DstVTy->getElementCount().isVector()) {
1168 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1169 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1170 T *TTI = static_cast<T *>(this);
1171 // If both types need to be split then the split is free.
1172 InstructionCost SplitCost =
1173 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1174 return SplitCost +
1175 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1176 CostKind, I));
1177 }
1178
1179 // Scalarization cost is Invalid, can't assume any num elements.
1180 if (isa<ScalableVectorType>(DstVTy))
1181 return InstructionCost::getInvalid();
1182
1183 // In other cases where the source or destination are illegal, assume
1184 // the operation will get scalarized.
1185 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1186 InstructionCost Cost = thisT()->getCastInstrCost(
1187 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1188
1189 // Return the cost of multiple scalar invocations plus the cost of
1190 // inserting and extracting the values.
1191 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1192 CostKind) +
1193 Num * Cost;
1194 }
1195
1196 // We already handled vector-to-vector and scalar-to-scalar conversions.
1197 // This is where we handle bitcasts between vectors and scalars. We need to
1198 // assume that the conversion is scalarized in one way or another.
1200 if (Opcode == Instruction::BitCast) {
1201 // Illegal bitcasts are done by storing and loading from a stack slot.
1202 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1203 /*Extract*/ true, CostKind)
1204 : 0) +
1205 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1206 /*Extract*/ false, CostKind)
1207 : 0);
1208 }
1209
1210 llvm_unreachable("Unhandled cast");
1211 }
1212
1214 VectorType *VecTy, unsigned Index) {
1216 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1217 CostKind, Index, nullptr, nullptr) +
1218 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1220 }
1221
1223 const Instruction *I = nullptr) {
1224 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1225 }
1226
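  /// Estimate the cost of a compare or select: one unit per legalized value
  /// when the legalized (v)select/setcc is not expanded, otherwise the sum of
  /// the scalarized operations.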
1227 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1228 CmpInst::Predicate VecPred,
1230 const Instruction *I = nullptr) {
1231 const TargetLoweringBase *TLI = getTLI();
1232 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1233 assert(ISD && "Invalid opcode");
1234
1235 // TODO: Handle other cost kinds.
1237 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1238 I);
1239
1240 // Selects on vectors are actually vector selects.
1241 if (ISD == ISD::SELECT) {
1242 assert(CondTy && "CondTy must exist");
1243 if (CondTy->isVectorTy())
1244 ISD = ISD::VSELECT;
1245 }
1246 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1247
1248 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1249 !TLI->isOperationExpand(ISD, LT.second)) {
1250 // The operation is legal. Assume it costs 1. Multiply
1251 // by the type-legalization overhead.
1252 return LT.first * 1;
1253 }
1254
1255 // Otherwise, assume that the compare/select is scalarized.
1256 // TODO: If one of the types gets legalized by splitting, handle this
1257 // similarly to what getCastInstrCost() does.
1258 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1259 if (isa<ScalableVectorType>(ValTy))
1260 return InstructionCost::getInvalid();
1261
1262 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1263 if (CondTy)
1264 CondTy = CondTy->getScalarType();
1265 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1266 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1267
1268 // Return the cost of multiple scalar invocations plus the cost of
1269 // inserting and extracting the values.
1270 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1271 /*Extract*/ false, CostKind) +
1272 Num * Cost;
1273 }
1274
1275 // Unknown scalar opcode.
1276 return 1;
1277 }
1278
1281 unsigned Index, Value *Op0, Value *Op1) {
1282 return getRegUsageForType(Val->getScalarType());
1283 }
1284
1287 unsigned Index) {
1288 Value *Op0 = nullptr;
1289 Value *Op1 = nullptr;
1290 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1291 Op0 = IE->getOperand(0);
1292 Op1 = IE->getOperand(1);
1293 }
1294 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1295 Op1);
1296 }
1297
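  /// Cost of replicating each of the VF source elements ReplicationFactor
  /// times, modelled as extracting the demanded source elements and inserting
  /// the demanded elements of the replicated (wide) vector.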
1298 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1299 int VF,
1300 const APInt &DemandedDstElts,
1302 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1303 "Unexpected size of DemandedDstElts.");
1304
1306
1307 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1308 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1309
1310 // The mask shuffling cost is to extract all the elements of the mask
1311 // and insert each of them Factor times into the wide vector:
1312 //
1313 // E.g. an interleaved group with factor 3:
1314 // %mask = icmp ult <8 x i32> %vec1, %vec2
1315 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1316 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1317 // The cost is estimated as extracting all mask elements from the <8xi1>
1318 // mask vector and inserting them Factor times into the <24xi1> shuffled
1319 // mask vector.
1320 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1321 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1322 /*Insert*/ false,
1323 /*Extract*/ true, CostKind);
1324 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1325 /*Insert*/ true,
1326 /*Extract*/ false, CostKind);
1327
1328 return Cost;
1329 }
1330
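  /// Estimate a plain load/store as one unit per legalized value, plus a
  /// scalarization penalty when the access would require an extending load or
  /// truncating store that the target does not support.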
1332 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1335 const Instruction *I = nullptr) {
1336 assert(!Src->isVoidTy() && "Invalid type");
1337 // Assume types, such as structs, are expensive.
1338 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1339 return 4;
1340 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1341
1342 // Assuming that all loads of legal types cost 1.
1343 InstructionCost Cost = LT.first;
1345 return Cost;
1346
1347 const DataLayout &DL = this->getDataLayout();
1348 if (Src->isVectorTy() &&
1349 // In practice it's not currently possible to have a change in lane
1350 // length for extending loads or truncating stores so both types should
1351 // have the same scalable property.
1353 LT.second.getSizeInBits())) {
1354 // This is a vector load that legalizes to a larger type than the vector
1355 // itself. Unless the corresponding extending load or truncating store is
1356 // legal, this will scalarize.
1358 EVT MemVT = getTLI()->getValueType(DL, Src);
1359 if (Opcode == Instruction::Store)
1360 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1361 else
1362 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1363
1364 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1365 // This is a vector load/store for some illegal type that is scalarized.
1366 // We must account for the cost of building or decomposing the vector.
1368 cast<VectorType>(Src), Opcode != Instruction::Store,
1369 Opcode == Instruction::Store, CostKind);
1370 }
1371 }
1372
1373 return Cost;
1374 }
1375
1377 Align Alignment, unsigned AddressSpace,
1379 // TODO: Pass on AddressSpace when we have test coverage.
1380 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1381 CostKind);
1382 }
1383
1385 const Value *Ptr, bool VariableMask,
1386 Align Alignment,
1388 const Instruction *I = nullptr) {
1389 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1390 true, CostKind);
1391 }
1392
1394 const Value *Ptr, bool VariableMask,
1395 Align Alignment,
1397 const Instruction *I) {
1398 // For a target without strided memory operations (or for an illegal
1399 // operation type on one which does), assume we lower to a gather/scatter
1400 // operation. (Which may in turn be scalarized.)
1401 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1402 Alignment, CostKind, I);
1403 }
1404
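  /// Estimate an interleaved memory access as the underlying (possibly masked)
  /// wide load/store, scaled by the fraction of legalized instructions that
  /// are actually used, plus the scalarization cost of (de)interleaving the
  /// accessed members and any required mask manipulation.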
1405 InstructionCost getInterleavedMemoryOpCost(
1406 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1407 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1408 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1409
1410 // We cannot scalarize scalable vectors, so return Invalid.
1411 if (isa<ScalableVectorType>(VecTy))
1412 return InstructionCost::getInvalid();
1413
1414 auto *VT = cast<FixedVectorType>(VecTy);
1415
1416 unsigned NumElts = VT->getNumElements();
1417 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1418
1419 unsigned NumSubElts = NumElts / Factor;
1420 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1421
1422 // First, the cost of the load/store operation.
1424 if (UseMaskForCond || UseMaskForGaps)
1425 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1427 else
1428 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1429 CostKind);
1430
1431 // Legalize the vector type, and get the legalized and unlegalized type
1432 // sizes.
1433 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1434 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1435 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1436
1437 // Scale the cost of the memory operation by the fraction of legalized
1438 // instructions that will actually be used. We shouldn't account for the
1439 // cost of dead instructions since they will be removed.
1440 //
1441 // E.g., An interleaved load of factor 8:
1442 // %vec = load <16 x i64>, <16 x i64>* %ptr
1443 // %v0 = shufflevector %vec, undef, <0, 8>
1444 //
1445 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1446 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1447 // type). The other loads are unused.
1448 //
1449 // TODO: Note that legalization can turn masked loads/stores into unmasked
1450 // (legalized) loads/stores. This can be reflected in the cost.
1451 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1452 // The number of loads of a legal type it will take to represent a load
1453 // of the unlegalized vector type.
1454 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1455
1456 // The number of elements of the unlegalized type that correspond to a
1457 // single legal instruction.
1458 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1459
1460 // Determine which legal instructions will be used.
1461 BitVector UsedInsts(NumLegalInsts, false);
1462 for (unsigned Index : Indices)
1463 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1464 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1465
1466 // Scale the cost of the load by the fraction of legal instructions that
1467 // will be used.
1468 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1469 }
1470
1471 // Then add the cost of the interleave operation.
1472 assert(Indices.size() <= Factor &&
1473 "Interleaved memory op has too many members");
1474
1475 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1476 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1477
1478 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1479 for (unsigned Index : Indices) {
1480 assert(Index < Factor && "Invalid index for interleaved memory op");
1481 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1482 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1483 }
1484
1485 if (Opcode == Instruction::Load) {
1486 // The interleave cost is similar to extracting the sub vectors' elements
1487 // from the wide vector and inserting them into the sub vectors.
1488 //
1489 // E.g. An interleaved load of factor 2 (with one member of index 0):
1490 // %vec = load <8 x i32>, <8 x i32>* %ptr
1491 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1492 // The cost is estimated as extracting elements 0, 2, 4 and 6 from the
1493 // <8 x i32> vector and inserting them into a <4 x i32> vector.
1494 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1495 SubVT, DemandedAllSubElts,
1496 /*Insert*/ true, /*Extract*/ false, CostKind);
1497 Cost += Indices.size() * InsSubCost;
1498 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1499 /*Insert*/ false,
1500 /*Extract*/ true, CostKind);
1501 } else {
1502 // The interleave cost is to extract elements from the sub vectors and
1503 // insert them into the wide vector.
1504 //
1505 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1506 // (using VF=4):
1507 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1508 // %gaps.mask = <true, true, false, true, true, false,
1509 // true, true, false, true, true, false>
1510 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1511 // i32 Align, <12 x i1> %gaps.mask
1512 // The cost is estimated as extracting all elements (of actual members,
1513 // excluding gaps) from both <4 x i32> vectors and inserting them into the
1514 // <12 x i32> vector.
1515 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1516 SubVT, DemandedAllSubElts,
1517 /*Insert*/ false, /*Extract*/ true, CostKind);
1518 Cost += ExtSubCost * Indices.size();
1519 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1520 /*Insert*/ true,
1521 /*Extract*/ false, CostKind);
1522 }
1523
1524 if (!UseMaskForCond)
1525 return Cost;
1526
1527 Type *I8Type = Type::getInt8Ty(VT->getContext());
1528
1529 Cost += thisT()->getReplicationShuffleCost(
1530 I8Type, Factor, NumSubElts,
1531 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1532 CostKind);
1533
1534 // The Gaps mask is invariant and created outside the loop, therefore the
1535 // cost of creating it is not accounted for here. However, if we have both
1536 // a MaskForGaps and some other mask that guards the execution of the
1537 // memory access, we need to account for the cost of And-ing the two masks
1538 // inside the loop.
1539 if (UseMaskForGaps) {
1540 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1541 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1542 CostKind);
1543 }
1544
1545 return Cost;
1546 }
1547
1548 /// Get intrinsic cost based on arguments.
1551 // Check for generically free intrinsics.
1553 return 0;
1554
1555 // Assume that target intrinsics are cheap.
1556 Intrinsic::ID IID = ICA.getID();
1559
1560 if (ICA.isTypeBasedOnly())
1562
1563 Type *RetTy = ICA.getReturnType();
1564
1565 ElementCount RetVF =
1566 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1568 const IntrinsicInst *I = ICA.getInst();
1569 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1570 FastMathFlags FMF = ICA.getFlags();
1571 switch (IID) {
1572 default:
1573 break;
1574
1575 case Intrinsic::powi:
1576 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1577 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1578 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1579 ShouldOptForSize)) {
1580 // The cost is modeled on the expansion performed by ExpandPowI in
1581 // SelectionDAGBuilder.
1582 APInt Exponent = RHSC->getValue().abs();
1583 unsigned ActiveBits = Exponent.getActiveBits();
1584 unsigned PopCount = Exponent.popcount();
1585 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1586 thisT()->getArithmeticInstrCost(
1587 Instruction::FMul, RetTy, CostKind);
1588 if (RHSC->isNegative())
1589 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1590 CostKind);
1591 return Cost;
1592 }
1593 }
1594 break;
1595 case Intrinsic::cttz:
1596 // FIXME: If necessary, this should go in target-specific overrides.
1597 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1599 break;
1600
1601 case Intrinsic::ctlz:
1602 // FIXME: If necessary, this should go in target-specific overrides.
1603 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1605 break;
1606
1607 case Intrinsic::memcpy:
1608 return thisT()->getMemcpyCost(ICA.getInst());
1609
1610 case Intrinsic::masked_scatter: {
1611 const Value *Mask = Args[3];
1612 bool VarMask = !isa<Constant>(Mask);
1613 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1614 return thisT()->getGatherScatterOpCost(Instruction::Store,
1615 ICA.getArgTypes()[0], Args[1],
1616 VarMask, Alignment, CostKind, I);
1617 }
1618 case Intrinsic::masked_gather: {
1619 const Value *Mask = Args[2];
1620 bool VarMask = !isa<Constant>(Mask);
1621 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1622 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1623 VarMask, Alignment, CostKind, I);
1624 }
1625 case Intrinsic::experimental_vp_strided_store: {
1626 const Value *Data = Args[0];
1627 const Value *Ptr = Args[1];
1628 const Value *Mask = Args[3];
1629 const Value *EVL = Args[4];
1630 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1631 Type *EltTy = cast<VectorType>(Data->getType())->getElementType();
1632 Align Alignment =
1633 I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
1634 return thisT()->getStridedMemoryOpCost(Instruction::Store,
1635 Data->getType(), Ptr, VarMask,
1636 Alignment, CostKind, I);
1637 }
1638 case Intrinsic::experimental_vp_strided_load: {
1639 const Value *Ptr = Args[0];
1640 const Value *Mask = Args[2];
1641 const Value *EVL = Args[3];
1642 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1643 Type *EltTy = cast<VectorType>(RetTy)->getElementType();
1644 Align Alignment =
1645 I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
1646 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1647 VarMask, Alignment, CostKind, I);
1648 }
1649 case Intrinsic::experimental_stepvector: {
1650 if (isa<ScalableVectorType>(RetTy))
1651 return InstructionCost::getInvalid();
1652 // The cost of materialising a constant integer vector.
1654 }
1655 case Intrinsic::vector_extract: {
1656 // FIXME: Handle case where a scalable vector is extracted from a scalable
1657 // vector
1658 if (isa<ScalableVectorType>(RetTy))
1659 return InstructionCost::getInvalid();
1660 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1661 return thisT()->getShuffleCost(
1662 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1663 std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1664 }
1665 case Intrinsic::vector_insert: {
1666 // FIXME: Handle case where a scalable vector is inserted into a scalable
1667 // vector
1668 if (isa<ScalableVectorType>(Args[1]->getType()))
1669 return InstructionCost::getInvalid();
1670 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1671 return thisT()->getShuffleCost(
1672 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1673 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1674 }
1675 case Intrinsic::vector_reverse: {
1676 return thisT()->getShuffleCost(
1677 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1678 CostKind, 0, cast<VectorType>(RetTy));
1679 }
1680 case Intrinsic::vector_splice: {
1681 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1682 return thisT()->getShuffleCost(
1683 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1684 CostKind, Index, cast<VectorType>(RetTy));
1685 }
1686 case Intrinsic::vector_reduce_add:
1687 case Intrinsic::vector_reduce_mul:
1688 case Intrinsic::vector_reduce_and:
1689 case Intrinsic::vector_reduce_or:
1690 case Intrinsic::vector_reduce_xor:
1691 case Intrinsic::vector_reduce_smax:
1692 case Intrinsic::vector_reduce_smin:
1693 case Intrinsic::vector_reduce_fmax:
1694 case Intrinsic::vector_reduce_fmin:
1695 case Intrinsic::vector_reduce_fmaximum:
1696 case Intrinsic::vector_reduce_fminimum:
1697 case Intrinsic::vector_reduce_umax:
1698 case Intrinsic::vector_reduce_umin: {
1699 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1701 }
1702 case Intrinsic::vector_reduce_fadd:
1703 case Intrinsic::vector_reduce_fmul: {
1705 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1707 }
1708 case Intrinsic::fshl:
1709 case Intrinsic::fshr: {
1710 const Value *X = Args[0];
1711 const Value *Y = Args[1];
1712 const Value *Z = Args[2];
1715 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1716 const TTI::OperandValueInfo OpInfoBW =
1718 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1719 : TTI::OP_None};
1720
1721 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1722 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1724 Cost +=
1725 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1726 Cost +=
1727 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1728 Cost += thisT()->getArithmeticInstrCost(
1729 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1730 {OpInfoZ.Kind, TTI::OP_None});
1731 Cost += thisT()->getArithmeticInstrCost(
1732 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1733 {OpInfoZ.Kind, TTI::OP_None});
1734 // Non-constant shift amounts require a modulo.
1735 if (!OpInfoZ.isConstant())
1736 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1737 CostKind, OpInfoZ, OpInfoBW);
1738 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1739 if (X != Y) {
1740 Type *CondTy = RetTy->getWithNewBitWidth(1);
1741 Cost +=
1742 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1744 Cost +=
1745 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1747 }
1748 return Cost;
1749 }
1750 case Intrinsic::get_active_lane_mask: {
1751 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1752 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1753
1754 // If we're not expanding the intrinsic then we assume this is cheap
1755 // to implement.
1756 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1757 return getTypeLegalizationCost(RetTy).first;
1758 }
1759
1760 // Create the expanded types that will be used to calculate the uadd_sat
1761 // operation.
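// For example (illustrative only), a call such as
//   <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %base, i64 %n)
// is priced as a <4 x i64> uadd_sat of the splatted %base with a step
// vector, followed by an unsigned compare against the splatted %n that
// produces the <4 x i1> mask.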
1762 Type *ExpRetTy = VectorType::get(
1763 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1764 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1766 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1767 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1769 return Cost;
1770 }
1771 case Intrinsic::experimental_cttz_elts: {
1772 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1773
1774 // If we're not expanding the intrinsic then we assume this is cheap
1775 // to implement.
1776 if (!getTLI()->shouldExpandCttzElements(ArgType))
1777 return getTypeLegalizationCost(RetTy).first;
1778
1779 // TODO: The costs below reflect the expansion code in
1780 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1781 // favour of compile time.
1782
1783 // Find the smallest "sensible" element type to use for the expansion.
1784 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1785 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1786 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1787 VScaleRange = getVScaleRange(I->getCaller(), 64);
1788
1789 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1790 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1791 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1792
1793 // Create the new vector type & get the vector length
1794 Type *NewVecTy = VectorType::get(
1795 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1796
1797 IntrinsicCostAttributes StepVecAttrs(Intrinsic::experimental_stepvector,
1798 NewVecTy, {}, FMF);
1800 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1801
1802 Cost +=
1803 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1804 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1805 Args[0]->getType(),
1807 Cost +=
1808 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1809
1810 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1811 NewEltTy, NewVecTy, FMF, I, 1);
1812 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1813 Cost +=
1814 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1815
1816 return Cost;
1817 }
1818 }
1819
1820 // VP Intrinsics should have the same cost as their non-vp counterpart.
1821 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1822 // counterpart when the vector length argument is smaller than the maximum
1823 // vector length.
1824 // TODO: Support other kinds of VPIntrinsics
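// Informally: something like vp.add on <4 x i32> is priced like a plain add
// on <4 x i32>, and vp.load/vp.store are priced like ordinary vector
// loads/stores of the same type; the mask and EVL operands are ignored for
// costing purposes here.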
1825 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1826 std::optional<unsigned> FOp =
1828 if (FOp) {
1829 if (ICA.getID() == Intrinsic::vp_load) {
1830 Align Alignment;
1831 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1832 Alignment = VPI->getPointerAlignment().valueOrOne();
1833 unsigned AS = 0;
1834 if (ICA.getArgs().size() > 1)
1835 if (auto *PtrTy =
1836 dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
1837 AS = PtrTy->getAddressSpace();
1838 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1839 AS, CostKind);
1840 }
1841 if (ICA.getID() == Intrinsic::vp_store) {
1842 Align Alignment;
1843 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1844 Alignment = VPI->getPointerAlignment().valueOrOne();
1845 unsigned AS = 0;
1846 if (ICA.getArgs().size() >= 2)
1847 if (auto *PtrTy =
1848 dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
1849 AS = PtrTy->getAddressSpace();
1850 return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
1851 AS, CostKind);
1852 }
1854 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1855 CostKind);
1856 }
1857 }
1858
1859 std::optional<Intrinsic::ID> FID =
1861 if (FID) {
1862 // Non-vp version will have same Args/Tys except mask and vector length.
1863 assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
1864 "Expected VPIntrinsic to have Mask and Vector Length args and "
1865 "types");
1867
1868 // VPReduction intrinsics have a start value argument that their non-vp
1869 // counterparts do not have, except for the fadd and fmul non-vp
1870 // counterparts.
1872 *FID != Intrinsic::vector_reduce_fadd &&
1873 *FID != Intrinsic::vector_reduce_fmul)
1874 NewTys = NewTys.drop_front();
1875
1876 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1877 ICA.getFlags());
1878 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1879 }
1880 }
1881
1882 // Assume that we need to scalarize this intrinsic.
1883 // Compute the scalarization overhead based on Args for a vector
1884 // intrinsic.
1885 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1886 if (RetVF.isVector() && !RetVF.isScalable()) {
1887 ScalarizationCost = 0;
1888 if (!RetTy->isVoidTy())
1889 ScalarizationCost += getScalarizationOverhead(
1890 cast<VectorType>(RetTy),
1891 /*Insert*/ true, /*Extract*/ false, CostKind);
1892 ScalarizationCost +=
1894 }
1895
1896 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1897 ScalarizationCost);
1898 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1899 }
1900
1901 /// Get intrinsic cost based on argument types.
1902 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1903 /// cost of scalarizing the arguments and the return value will be computed
1904 /// based on types.
1905 InstructionCost
1906 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1907 TTI::TargetCostKind CostKind) {
1908 Intrinsic::ID IID = ICA.getID();
1909 Type *RetTy = ICA.getReturnType();
1910 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1911 FastMathFlags FMF = ICA.getFlags();
1912 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1913 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1914
1915 VectorType *VecOpTy = nullptr;
1916 if (!Tys.empty()) {
1917 // The vector reduction operand is operand 0 except for fadd/fmul.
1918 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1919 unsigned VecTyIndex = 0;
1920 if (IID == Intrinsic::vector_reduce_fadd ||
1921 IID == Intrinsic::vector_reduce_fmul)
1922 VecTyIndex = 1;
1923 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1924 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1925 }
1926
1927 // Library call cost - other than size, make it expensive.
1928 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1929 unsigned ISD = 0;
1930 switch (IID) {
1931 default: {
1932 // Scalable vectors cannot be scalarized, so return Invalid.
1933 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1934 return isa<ScalableVectorType>(Ty);
1935 }))
1936 return InstructionCost::getInvalid();
1937
1938 // Assume that we need to scalarize this intrinsic.
1939 InstructionCost ScalarizationCost =
1940 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1941 unsigned ScalarCalls = 1;
1942 Type *ScalarRetTy = RetTy;
1943 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1944 if (!SkipScalarizationCost)
1945 ScalarizationCost = getScalarizationOverhead(
1946 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1947 ScalarCalls = std::max(ScalarCalls,
1948 cast<FixedVectorType>(RetVTy)->getNumElements());
1949 ScalarRetTy = RetTy->getScalarType();
1950 }
1951 SmallVector<Type *, 4> ScalarTys;
1952 for (Type *Ty : Tys) {
1953 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1954 if (!SkipScalarizationCost)
1955 ScalarizationCost += getScalarizationOverhead(
1956 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1957 ScalarCalls = std::max(ScalarCalls,
1958 cast<FixedVectorType>(VTy)->getNumElements());
1959 Ty = Ty->getScalarType();
1960 }
1961 ScalarTys.push_back(Ty);
1962 }
1963 if (ScalarCalls == 1)
1964 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1965
1966 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1967 InstructionCost ScalarCost =
1968 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1969
1970 return ScalarCalls * ScalarCost + ScalarizationCost;
1971 }
1972 // Look for intrinsics that can be lowered directly or turned into a scalar
1973 // intrinsic call.
1974 case Intrinsic::sqrt:
1975 ISD = ISD::FSQRT;
1976 break;
1977 case Intrinsic::sin:
1978 ISD = ISD::FSIN;
1979 break;
1980 case Intrinsic::cos:
1981 ISD = ISD::FCOS;
1982 break;
1983 case Intrinsic::tan:
1984 ISD = ISD::FTAN;
1985 break;
1986 case Intrinsic::asin:
1987 ISD = ISD::FASIN;
1988 break;
1989 case Intrinsic::acos:
1990 ISD = ISD::FACOS;
1991 break;
1992 case Intrinsic::atan:
1993 ISD = ISD::FATAN;
1994 break;
1995 case Intrinsic::sinh:
1996 ISD = ISD::FSINH;
1997 break;
1998 case Intrinsic::cosh:
1999 ISD = ISD::FCOSH;
2000 break;
2001 case Intrinsic::tanh:
2002 ISD = ISD::FTANH;
2003 break;
2004 case Intrinsic::exp:
2005 ISD = ISD::FEXP;
2006 break;
2007 case Intrinsic::exp2:
2008 ISD = ISD::FEXP2;
2009 break;
2010 case Intrinsic::exp10:
2011 ISD = ISD::FEXP10;
2012 break;
2013 case Intrinsic::log:
2014 ISD = ISD::FLOG;
2015 break;
2016 case Intrinsic::log10:
2017 ISD = ISD::FLOG10;
2018 break;
2019 case Intrinsic::log2:
2020 ISD = ISD::FLOG2;
2021 break;
2022 case Intrinsic::fabs:
2023 ISD = ISD::FABS;
2024 break;
2025 case Intrinsic::canonicalize:
2026 ISD = ISD::FCANONICALIZE;
2027 break;
2028 case Intrinsic::minnum:
2029 ISD = ISD::FMINNUM;
2030 break;
2031 case Intrinsic::maxnum:
2032 ISD = ISD::FMAXNUM;
2033 break;
2034 case Intrinsic::minimum:
2035 ISD = ISD::FMINIMUM;
2036 break;
2037 case Intrinsic::maximum:
2038 ISD = ISD::FMAXIMUM;
2039 break;
2040 case Intrinsic::minimumnum:
2041 ISD = ISD::FMINIMUMNUM;
2042 break;
2043 case Intrinsic::maximumnum:
2044 ISD = ISD::FMAXIMUMNUM;
2045 break;
2046 case Intrinsic::copysign:
2047 ISD = ISD::FCOPYSIGN;
2048 break;
2049 case Intrinsic::floor:
2050 ISD = ISD::FFLOOR;
2051 break;
2052 case Intrinsic::ceil:
2053 ISD = ISD::FCEIL;
2054 break;
2055 case Intrinsic::trunc:
2056 ISD = ISD::FTRUNC;
2057 break;
2058 case Intrinsic::nearbyint:
2059 ISD = ISD::FNEARBYINT;
2060 break;
2061 case Intrinsic::rint:
2062 ISD = ISD::FRINT;
2063 break;
2064 case Intrinsic::lrint:
2065 ISD = ISD::LRINT;
2066 break;
2067 case Intrinsic::llrint:
2068 ISD = ISD::LLRINT;
2069 break;
2070 case Intrinsic::round:
2071 ISD = ISD::FROUND;
2072 break;
2073 case Intrinsic::roundeven:
2074 ISD = ISD::FROUNDEVEN;
2075 break;
2076 case Intrinsic::pow:
2077 ISD = ISD::FPOW;
2078 break;
2079 case Intrinsic::fma:
2080 ISD = ISD::FMA;
2081 break;
2082 case Intrinsic::fmuladd:
2083 ISD = ISD::FMA;
2084 break;
2085 case Intrinsic::experimental_constrained_fmuladd:
2086 ISD = ISD::STRICT_FMA;
2087 break;
2088 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2089 case Intrinsic::lifetime_start:
2090 case Intrinsic::lifetime_end:
2091 case Intrinsic::sideeffect:
2092 case Intrinsic::pseudoprobe:
2093 case Intrinsic::arithmetic_fence:
2094 return 0;
2095 case Intrinsic::masked_store: {
2096 Type *Ty = Tys[0];
2097 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2098 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2099 CostKind);
2100 }
2101 case Intrinsic::masked_load: {
2102 Type *Ty = RetTy;
2103 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2104 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2105 CostKind);
2106 }
2107 case Intrinsic::vector_reduce_add:
2108 case Intrinsic::vector_reduce_mul:
2109 case Intrinsic::vector_reduce_and:
2110 case Intrinsic::vector_reduce_or:
2111 case Intrinsic::vector_reduce_xor:
2112 return thisT()->getArithmeticReductionCost(
2113 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2114 CostKind);
2115 case Intrinsic::vector_reduce_fadd:
2116 case Intrinsic::vector_reduce_fmul:
2117 return thisT()->getArithmeticReductionCost(
2118 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2119 case Intrinsic::vector_reduce_smax:
2120 case Intrinsic::vector_reduce_smin:
2121 case Intrinsic::vector_reduce_umax:
2122 case Intrinsic::vector_reduce_umin:
2123 case Intrinsic::vector_reduce_fmax:
2124 case Intrinsic::vector_reduce_fmin:
2125 case Intrinsic::vector_reduce_fmaximum:
2126 case Intrinsic::vector_reduce_fminimum:
2127 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2128 VecOpTy, ICA.getFlags(), CostKind);
2129 case Intrinsic::abs:
2130 ISD = ISD::ABS;
2131 break;
2132 case Intrinsic::smax:
2133 ISD = ISD::SMAX;
2134 break;
2135 case Intrinsic::smin:
2136 ISD = ISD::SMIN;
2137 break;
2138 case Intrinsic::umax:
2139 ISD = ISD::UMAX;
2140 break;
2141 case Intrinsic::umin:
2142 ISD = ISD::UMIN;
2143 break;
2144 case Intrinsic::sadd_sat:
2145 ISD = ISD::SADDSAT;
2146 break;
2147 case Intrinsic::ssub_sat:
2148 ISD = ISD::SSUBSAT;
2149 break;
2150 case Intrinsic::uadd_sat:
2151 ISD = ISD::UADDSAT;
2152 break;
2153 case Intrinsic::usub_sat:
2154 ISD = ISD::USUBSAT;
2155 break;
2156 case Intrinsic::smul_fix:
2157 ISD = ISD::SMULFIX;
2158 break;
2159 case Intrinsic::umul_fix:
2160 ISD = ISD::UMULFIX;
2161 break;
2162 case Intrinsic::sadd_with_overflow:
2163 ISD = ISD::SADDO;
2164 break;
2165 case Intrinsic::ssub_with_overflow:
2166 ISD = ISD::SSUBO;
2167 break;
2168 case Intrinsic::uadd_with_overflow:
2169 ISD = ISD::UADDO;
2170 break;
2171 case Intrinsic::usub_with_overflow:
2172 ISD = ISD::USUBO;
2173 break;
2174 case Intrinsic::smul_with_overflow:
2175 ISD = ISD::SMULO;
2176 break;
2177 case Intrinsic::umul_with_overflow:
2178 ISD = ISD::UMULO;
2179 break;
2180 case Intrinsic::fptosi_sat:
2181 ISD = ISD::FP_TO_SINT_SAT;
2182 break;
2183 case Intrinsic::fptoui_sat:
2184 ISD = ISD::FP_TO_UINT_SAT;
2185 break;
2186 case Intrinsic::ctpop:
2187 ISD = ISD::CTPOP;
2188 // In case of legalization use TCC_Expensive. This is cheaper than a
2189 // library call but still not a cheap instruction.
2190 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2191 break;
2192 case Intrinsic::ctlz:
2193 ISD = ISD::CTLZ;
2194 break;
2195 case Intrinsic::cttz:
2196 ISD = ISD::CTTZ;
2197 break;
2198 case Intrinsic::bswap:
2199 ISD = ISD::BSWAP;
2200 break;
2201 case Intrinsic::bitreverse:
2202 ISD = ISD::BITREVERSE;
2203 break;
2204 }
2205
2206 auto *ST = dyn_cast<StructType>(RetTy);
2207 Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy;
2208 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy);
2209
2210 const TargetLoweringBase *TLI = getTLI();
2211
2212 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2213 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2214 TLI->isFAbsFree(LT.second)) {
2215 return 0;
2216 }
2217
2218 // The operation is legal. Assume it costs 1.
2219 // If the type is split into multiple registers, assume that there is some
2220 // overhead to this.
2221 // TODO: Once we have extract/insert subvector cost we need to use them.
2222 if (LT.first > 1)
2223 return (LT.first * 2);
2224 else
2225 return (LT.first * 1);
2226 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2227 // If the operation is custom lowered then assume
2228 // that the code is twice as expensive.
2229 return (LT.first * 2);
2230 }
2231
2232 switch (IID) {
2233 case Intrinsic::fmuladd: {
2234 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2235 // point mul followed by an add.
2236
2237 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2238 CostKind) +
2239 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2240 CostKind);
2241 }
2242 case Intrinsic::experimental_constrained_fmuladd: {
2243 IntrinsicCostAttributes FMulAttrs(
2244 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2245 IntrinsicCostAttributes FAddAttrs(
2246 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2247 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2248 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2249 }
2250 case Intrinsic::smin:
2251 case Intrinsic::smax:
2252 case Intrinsic::umin:
2253 case Intrinsic::umax: {
2254 // minmax(X,Y) = select(icmp(X,Y),X,Y)
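// E.g. (sketch) smax(%a, %b) is costed as the pair:
//   %c = icmp sgt %a, %b
//   %r = select %c, %a, %b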
2255 Type *CondTy = RetTy->getWithNewBitWidth(1);
2256 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2257 CmpInst::Predicate Pred =
2258 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2260 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2261 Pred, CostKind);
2262 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2263 Pred, CostKind);
2264 return Cost;
2265 }
2266 case Intrinsic::sadd_with_overflow:
2267 case Intrinsic::ssub_with_overflow: {
2268 Type *SumTy = RetTy->getContainedType(0);
2269 Type *OverflowTy = RetTy->getContainedType(1);
2270 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2271 ? BinaryOperator::Add
2272 : BinaryOperator::Sub;
2273
2274 // Add:
2275 // Overflow -> (Result < LHS) ^ (RHS < 0)
2276 // Sub:
2277 // Overflow -> (Result < LHS) ^ (RHS > 0)
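// For instance (illustrative), sadd.with.overflow(%a, %b) is modelled as:
//   %sum = add %a, %b
//   %ov  = xor (icmp slt %sum, %a), (icmp slt %b, 0)
// i.e. one add, two compares and one xor, which is what is summed below.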
2279 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2280 Cost +=
2281 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy,
2283 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2284 CostKind);
2285 return Cost;
2286 }
2287 case Intrinsic::uadd_with_overflow:
2288 case Intrinsic::usub_with_overflow: {
2289 Type *SumTy = RetTy->getContainedType(0);
2290 Type *OverflowTy = RetTy->getContainedType(1);
2291 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2292 ? BinaryOperator::Add
2293 : BinaryOperator::Sub;
2294 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2297
2299 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2300 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
2301 OverflowTy, Pred, CostKind);
2302 return Cost;
2303 }
2304 case Intrinsic::smul_with_overflow:
2305 case Intrinsic::umul_with_overflow: {
2306 Type *MulTy = RetTy->getContainedType(0);
2307 Type *OverflowTy = RetTy->getContainedType(1);
2308 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2309 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2310 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2311
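// The expansion priced below widens to twice the bit width, e.g. for i32
// (sketch): sext/zext both operands to i64, multiply, truncate the low half
// back to i32, shift the high half down, and compare the high half against
// the expected sign/zero bits to detect overflow.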
2312 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2314
2316 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2317 Cost +=
2318 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2319 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2320 CCH, CostKind);
2321 Cost += thisT()->getArithmeticInstrCost(
2322 Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2324
2325 if (IsSigned)
2326 Cost += thisT()->getArithmeticInstrCost(
2327 Instruction::AShr, MulTy, CostKind,
2330
2331 Cost += thisT()->getCmpSelInstrCost(
2332 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2333 return Cost;
2334 }
2335 case Intrinsic::sadd_sat:
2336 case Intrinsic::ssub_sat: {
2337 // Assume a default expansion.
2338 Type *CondTy = RetTy->getWithNewBitWidth(1);
2339
2340 Type *OpTy = StructType::create({RetTy, CondTy});
2341 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2342 ? Intrinsic::sadd_with_overflow
2343 : Intrinsic::ssub_with_overflow;
2345
2346 // SatMax -> Overflow && SumDiff < 0
2347 // SatMin -> Overflow && SumDiff >= 0
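// Sketch of the assumed expansion for sadd.sat(%a, %b):
//   {%sum, %ov} = sadd.with.overflow(%a, %b)
//   %sat = select (%sum < 0), INT_MAX, INT_MIN
//   %res = select %ov, %sat, %sum
// i.e. the overflow intrinsic plus one compare and two selects.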
2349 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2350 nullptr, ScalarizationCostPassed);
2351 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2352 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2353 Pred, CostKind);
2354 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2355 CondTy, Pred, CostKind);
2356 return Cost;
2357 }
2358 case Intrinsic::uadd_sat:
2359 case Intrinsic::usub_sat: {
2360 Type *CondTy = RetTy->getWithNewBitWidth(1);
2361
2362 Type *OpTy = StructType::create({RetTy, CondTy});
2363 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2364 ? Intrinsic::uadd_with_overflow
2365 : Intrinsic::usub_with_overflow;
2366
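// Sketch of the assumed expansion: uadd.sat(%a, %b) becomes
//   {%sum, %ov} = uadd.with.overflow(%a, %b)
//   %res = select %ov, -1, %sum        ; usub.sat selects 0 instead
// so the cost is the overflow intrinsic plus a single select.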
2368 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2369 nullptr, ScalarizationCostPassed);
2370 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2371 Cost +=
2372 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2374 return Cost;
2375 }
2376 case Intrinsic::smul_fix:
2377 case Intrinsic::umul_fix: {
2378 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2379 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2380
2381 unsigned ExtOp =
2382 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2384
2386 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2387 Cost +=
2388 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2389 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2390 CCH, CostKind);
2391 Cost += thisT()->getArithmeticInstrCost(
2392 Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2394 Cost += thisT()->getArithmeticInstrCost(
2395 Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2397 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2398 return Cost;
2399 }
2400 case Intrinsic::abs: {
2401 // abs(X) = select(icmp(X,0),X,sub(0,X))
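// E.g. (sketch) abs(%x) on i32 is priced as:
//   %neg = sub i32 0, %x
//   %c   = icmp sgt i32 %x, 0
//   %res = select %c, %x, %neg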
2402 Type *CondTy = RetTy->getWithNewBitWidth(1);
2405 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2406 Pred, CostKind);
2407 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2408 Pred, CostKind);
2409 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2410 Cost += thisT()->getArithmeticInstrCost(
2411 BinaryOperator::Sub, RetTy, CostKind,
2413 return Cost;
2414 }
2415 case Intrinsic::fptosi_sat:
2416 case Intrinsic::fptoui_sat: {
2417 if (Tys.empty())
2418 break;
2419 Type *FromTy = Tys[0];
2420 bool IsSigned = IID == Intrinsic::fptosi_sat;
2421
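// Sketch of the assumed expansion for fptosi.sat (float -> i32): clamp the
// input with maxnum/minnum to the representable range, convert with fptosi,
// and, in the signed case, add an fcmp uno + select to force NaN inputs to
// zero.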
2423 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2424 {FromTy, FromTy});
2425 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2426 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2427 {FromTy, FromTy});
2428 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2429 Cost += thisT()->getCastInstrCost(
2430 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2432 if (IsSigned) {
2433 Type *CondTy = RetTy->getWithNewBitWidth(1);
2434 Cost += thisT()->getCmpSelInstrCost(
2435 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2436 Cost += thisT()->getCmpSelInstrCost(
2437 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2438 }
2439 return Cost;
2440 }
2441 default:
2442 break;
2443 }
2444
2445 // Else, assume that we need to scalarize this intrinsic. For math builtins
2446 // this will emit a costly libcall, adding call overhead and spills. Make it
2447 // very expensive.
2448 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2449 // Scalable vectors cannot be scalarized, so return Invalid.
2450 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2451 return isa<ScalableVectorType>(Ty);
2452 }))
2453 return InstructionCost::getInvalid();
2454
2455 InstructionCost ScalarizationCost =
2456 SkipScalarizationCost
2457 ? ScalarizationCostPassed
2458 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2459 /*Extract*/ false, CostKind);
2460
2461 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2462 SmallVector<Type *, 4> ScalarTys;
2463 for (Type *Ty : Tys) {
2464 if (Ty->isVectorTy())
2465 Ty = Ty->getScalarType();
2466 ScalarTys.push_back(Ty);
2467 }
2468 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2469 InstructionCost ScalarCost =
2470 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2471 for (Type *Ty : Tys) {
2472 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2473 if (!ICA.skipScalarizationCost())
2474 ScalarizationCost += getScalarizationOverhead(
2475 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2476 ScalarCalls = std::max(ScalarCalls,
2477 cast<FixedVectorType>(VTy)->getNumElements());
2478 }
2479 }
2480 return ScalarCalls * ScalarCost + ScalarizationCost;
2481 }
2482
2483 // This is going to be turned into a library call; make it expensive.
2484 return SingleCallCost;
2485 }
2486
2487 /// Compute a cost of the given call instruction.
2488 ///
2489 /// Compute the cost of calling function F with return type RetTy and
2490 /// argument types Tys. F might be nullptr, in this case the cost of an
2491 /// arbitrary call with the specified signature will be returned.
2492 /// This is used, for instance, when we estimate call of a vector
2493 /// counterpart of the given function.
2494 /// \param F Called function, might be nullptr.
2495 /// \param RetTy Return value types.
2496 /// \param Tys Argument types.
2497 /// \returns The cost of Call instruction.
2499 ArrayRef<Type *> Tys,
2501 return 10;
2502 }
2503
2504 unsigned getNumberOfParts(Type *Tp) {
2505 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2506 return LT.first.isValid() ? *LT.first.getValue() : 0;
2507 }
2508
2510 const SCEV *) {
2511 return 0;
2512 }
2513
2514 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2515 /// We're assuming that reduction operations are performed in the following way:
2516 ///
2517 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2518 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2519 /// \----------------v-------------/ \----------v------------/
2520 /// n/2 elements n/2 elements
2521 /// %red1 = op <n x t> %val, <n x t> %val1
2522 /// After this operation we have a vector %red1 where only the first n/2
2523 /// elements are meaningful, the second n/2 elements are undefined and can be
2524 /// dropped. All other operations are actually working with the vector of
2525 /// length n/2, not n, though the real vector length is still n.
2526 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2527 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2528 /// \----------------v-------------/ \----------v------------/
2529 /// n/4 elements 3*n/4 elements
2530 /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
2531 /// length n/2, the resulting vector has length n/4 etc.
2532 ///
2533 /// The cost model should take into account that the actual length of the
2534 /// vector is reduced on each iteration.
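/// As a worked example (assuming a target whose widest legal vector holds
/// four elements), an add-reduction of <8 x i32> would be priced as one
/// extract-subvector shuffle plus one <4 x i32> add to halve the vector,
/// then two more shuffle+add levels on <4 x i32>, plus a final
/// extractelement of lane 0.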
2537 // Targets must implement a default value for the scalable case, since
2538 // we don't know how many lanes the vector has.
2539 if (isa<ScalableVectorType>(Ty))
2541
2542 Type *ScalarTy = Ty->getElementType();
2543 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2544 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2545 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2546 NumVecElts >= 2) {
2547 // Or reduction for i1 is represented as:
2548 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2549 // %res = cmp ne iReduxWidth %val, 0
2550 // And reduction for i1 is represented as:
2551 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2552 // %res = cmp eq iReduxWidth %val, -1 (all ones)
2553 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2554 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2556 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2559 }
2560 unsigned NumReduxLevels = Log2_32(NumVecElts);
2561 InstructionCost ArithCost = 0;
2562 InstructionCost ShuffleCost = 0;
2563 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2564 unsigned LongVectorCount = 0;
2565 unsigned MVTLen =
2566 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2567 while (NumVecElts > MVTLen) {
2568 NumVecElts /= 2;
2569 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2570 ShuffleCost +=
2571 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2572 CostKind, NumVecElts, SubTy);
2573 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2574 Ty = SubTy;
2575 ++LongVectorCount;
2576 }
2577
2578 NumReduxLevels -= LongVectorCount;
2579
2580 // The minimal length of the vector is limited by the real length of vector
2581 // operations performed on the current platform. That's why several final
2582 // reduction operations are performed on the vectors with the same
2583 // architecture-dependent length.
2584
2585 // By default reductions need one shuffle per reduction level.
2586 ShuffleCost +=
2587 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2588 std::nullopt, CostKind, 0, Ty);
2589 ArithCost +=
2590 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2591 return ShuffleCost + ArithCost +
2592 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2593 CostKind, 0, nullptr, nullptr);
2594 }
2595
2596 /// Try to calculate the cost of performing strict (in-order) reductions,
2597 /// which involves doing a sequence of floating point additions in lane
2598 /// order, starting with an initial value. For example, consider a scalar
2599 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2600 ///
2601 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2602 ///
2603 /// %add1 = %InitVal + %v0
2604 /// %add2 = %add1 + %v1
2605 /// %add3 = %add2 + %v2
2606 /// %add4 = %add3 + %v3
2607 ///
2608 /// As a simple estimate we can say the cost of such a reduction is 4 times
2609 /// the cost of a scalar FP addition. We can only estimate the costs for
2610 /// fixed-width vectors here because for scalable vectors we do not know the
2611 /// runtime number of operations.
2614 // Targets must implement a default value for the scalable case, since
2615 // we don't know how many lanes the vector has.
2616 if (isa<ScalableVectorType>(Ty))
2618
2619 auto *VTy = cast<FixedVectorType>(Ty);
2621 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2622 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2623 Opcode, VTy->getElementType(), CostKind);
2624 ArithCost *= VTy->getNumElements();
2625
2626 return ExtractCost + ArithCost;
2627 }
2628
2630 std::optional<FastMathFlags> FMF,
2632 assert(Ty && "Unknown reduction vector type");
2634 return getOrderedReductionCost(Opcode, Ty, CostKind);
2635 return getTreeReductionCost(Opcode, Ty, CostKind);
2636 }
2637
2638 /// Try to calculate op costs for min/max reduction operations.
2639 /// \param Ty Vector type of the values being reduced.
2641 FastMathFlags FMF,
2643 // Targets must implement a default value for the scalable case, since
2644 // we don't know how many lanes the vector has.
2645 if (isa<ScalableVectorType>(Ty))
2647
2648 Type *ScalarTy = Ty->getElementType();
2649 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2650 unsigned NumReduxLevels = Log2_32(NumVecElts);
2651 InstructionCost MinMaxCost = 0;
2652 InstructionCost ShuffleCost = 0;
2653 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2654 unsigned LongVectorCount = 0;
2655 unsigned MVTLen =
2656 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2657 while (NumVecElts > MVTLen) {
2658 NumVecElts /= 2;
2659 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2660
2661 ShuffleCost +=
2662 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2663 CostKind, NumVecElts, SubTy);
2664
2665 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2666 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2667 Ty = SubTy;
2668 ++LongVectorCount;
2669 }
2670
2671 NumReduxLevels -= LongVectorCount;
2672
2673 // The minimal length of the vector is limited by the real length of vector
2674 // operations performed on the current platform. That's why several final
2675 // reduction operations are performed on the vectors with the same
2676 // architecture-dependent length.
2677 ShuffleCost +=
2678 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2679 std::nullopt, CostKind, 0, Ty);
2680 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2681 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2682 // The last min/max should be in vector registers and we counted it above.
2683 // So we just need a single extractelement.
2684 return ShuffleCost + MinMaxCost +
2685 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2686 CostKind, 0, nullptr, nullptr);
2687 }
2688
2689 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2690 Type *ResTy, VectorType *Ty,
2691 FastMathFlags FMF,
2693 // Without any native support, this is equivalent to the cost of
2694 // vecreduce.opcode(ext(Ty A)).
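// E.g. (illustrative) an unsigned extending add-reduction from <16 x i8> to
// i32 is priced as a zext of <16 x i8> to <16 x i32> plus an ordinary
// vecreduce.add on <16 x i32>.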
2695 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2696 InstructionCost RedCost =
2697 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2698 InstructionCost ExtCost = thisT()->getCastInstrCost(
2699 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2701
2702 return RedCost + ExtCost;
2703 }
2704
2706 VectorType *Ty,
2708 // Without any native support, this is equivalent to the cost of
2709 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2710 // vecreduce.add(mul(A, B)).
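// E.g. (illustrative) a dot-product style reduction of two <16 x i8> inputs
// into i32 is priced as two extends to <16 x i32>, one <16 x i32> multiply,
// and an add-reduction of the product.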
2711 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2712 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2713 Instruction::Add, ExtTy, std::nullopt, CostKind);
2714 InstructionCost ExtCost = thisT()->getCastInstrCost(
2715 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2717
2718 InstructionCost MulCost =
2719 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2720
2721 return RedCost + MulCost + 2 * ExtCost;
2722 }
2723
2725
2726 /// @}
2727};
2728
2729/// Concrete BasicTTIImpl that can be used if no further customization
2730/// is needed.
2731class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2733
2734 friend class BasicTTIImplBase<BasicTTIImpl>;
2735
2736 const TargetSubtargetInfo *ST;
2737 const TargetLoweringBase *TLI;
2738
2739 const TargetSubtargetInfo *getST() const { return ST; }
2740 const TargetLoweringBase *getTLI() const { return TLI; }
2741
2742public:
2743 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2744};
2745
2746} // end namespace llvm
2747
2748#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:214
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1310
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1181
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1448
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1110
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:180
an instruction to allocate memory on the stack
Definition: Instructions.h:61
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:81
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:434
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:287
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:728
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:588
bool preferToKeepConstantsAttached(const Instruction &Inst, const Function &Fn) const
Definition: BasicTTIImpl.h:561
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:895
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:757
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:762
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:424
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:668
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
Definition: BasicTTIImpl.h:675
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:748
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:336
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:428
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:740
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:761
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:975
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:439
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:515
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Definition: BasicTTIImpl.h:582
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:450
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:377
bool shouldDropLSRSolutionIfLessProfitable() const
Definition: BasicTTIImpl.h:402
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:389
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:690
bool shouldFoldTerminatingConditionAfterLSR() const
Definition: BasicTTIImpl.h:397
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:732
bool hasBranchDivergence(const Function *F=nullptr)
Definition: BasicTTIImpl.h:281
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:383
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:309
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:814
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:768
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset)
Definition: BasicTTIImpl.h:353
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:444
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:547
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:708
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:410
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:285
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:680
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:273
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:357
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:798
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:714
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:744
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:323
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:660
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:300
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:897
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:551
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:861
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:540
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:319
unsigned getInliningThresholdMultiplier() const
Definition: BasicTTIImpl.h:580
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
bool isLegalAddScalableImmediate(int64_t Imm)
Definition: BasicTTIImpl.h:332
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:843
bool isVScaleKnownToBeAPowerOfTwo() const
Definition: BasicTTIImpl.h:763
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:684
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
Definition: BasicTTIImpl.h:291
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:328
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:295
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:724
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:305
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:283
int getInlinerVectorBonusPercent() const
Definition: BasicTTIImpl.h:586
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:697
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:340
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:313
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:264
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:581
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:406
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1104
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:767
This class represents a range of values.
Definition: ConstantRange.h:47
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:442
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:861
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:378
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:680
bool isTargetIntrinsic() const
isTargetIntrinsic - Returns true if this function is an intrinsic and the intrinsic is specific to a ...
Definition: Function.cpp:955
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:357
The core instruction combiner logic.
Definition: InstCombiner.h:47
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associatvity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isSpliceMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isTransposeMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask is a transpose mask.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
size_type size() const
Definition: SmallPtrSet.h:96
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:368
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:503
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:501
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed store is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
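The entries above combine into the legality-check pattern that recurs throughout this cost model: translate the IR opcode to an ISD node, map the IR type to an EVT, and ask the target how the operation will be lowered. lowersCheaply below is a hypothetical distillation of that pattern.

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;

static bool lowersCheaply(const TargetLoweringBase *TLI, const DataLayout &DL,
                          unsigned Opcode, Type *Ty) {
  // Map the IR opcode (e.g. Instruction::Add) to the matching ISD node.
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "unexpected IR opcode");
  // Map the IR type to the EVT the instruction selector will see.
  EVT VT = TLI->getValueType(DL, Ty);
  // Operations that are legal, custom-lowered or promoted on a legal type are
  // normally modelled as cheap; operations that will be expanded are not.
  return !TLI->isOperationExpand(ISD, VT);
}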
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
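A hedged sketch of how these two hooks combine when estimating switch lowering; willUseJumpTable is hypothetical, and NumCases/Range would normally be computed from the switch's case clusters.

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool willUseJumpTable(const TargetLoweringBase *TLI,
                             const SwitchInst *SI, uint64_t NumCases,
                             uint64_t Range, ProfileSummaryInfo *PSI,
                             BlockFrequencyInfo *BFI) {
  // Jump tables can be disabled per-function (e.g. by attributes).
  const Function *Fn = SI->getParent()->getParent();
  if (!TLI->areJTsAllowed(Fn))
    return false;
  // Otherwise defer to the target's density/size heuristic.
  return TLI->isSuitableForJumpTable(SI, NumCases, Range, PSI, BFI);
}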
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const
Return the minimum number of bits required to hold the maximum possible number of trailing zero vecto...
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
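A sketch of the truncating-store / extending-load legality queries as a cast cost model would use them; castFoldsIntoMemoryOp is hypothetical and only considers a truncate feeding a store and a zero-extend fed by a load.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static bool castFoldsIntoMemoryOp(const TargetLoweringBase *TLI,
                                  const DataLayout &DL, Type *DstTy,
                                  Type *SrcTy, bool IsTruncFeedingStore) {
  EVT SrcVT = TLI->getValueType(DL, SrcTy);
  EVT DstVT = TLI->getValueType(DL, DstTy);
  if (IsTruncFeedingStore)
    // trunc SrcTy -> DstTy whose only use is a store: free if the target has a
    // truncating store from SrcVT to the narrower memory type DstVT.
    return TLI->isTruncStoreLegal(SrcVT, DstVT);
  // zext SrcTy -> DstTy fed by a load: free if the target has a zero-extending
  // load producing DstVT from memory of type SrcVT.
  return TLI->isLoadExtLegal(ISD::ZEXTLOAD, DstVT, SrcVT);
}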
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const
Return the preferred common base offset.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
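getTypeConversion and getTypeAction drive the type-legalization walk this cost model relies on. The loop below is a rough, hypothetical distillation of that walk: follow the conversion chain until a legal type is reached, doubling the count on each split or expand step.

#include "llvm/CodeGen/TargetLowering.h"
#include <utility>
using namespace llvm;

static std::pair<unsigned, MVT> legalizeType(const TargetLoweringBase *TLI,
                                             LLVMContext &Ctx, EVT VT) {
  unsigned Cost = 1;
  while (!TLI->isTypeLegal(VT)) {
    TargetLoweringBase::LegalizeKind LK = TLI->getTypeConversion(Ctx, VT);
    // Splitting a vector or expanding an integer doubles the work; promotion
    // and widening keep the count unchanged.
    if (LK.first == TargetLoweringBase::TypeSplitVector ||
        LK.first == TargetLoweringBase::TypeExpandInteger)
      Cost *= 2;
    VT = LK.second;
  }
  return {Cost, VT.getSimpleVT()};
}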
virtual bool isLegalAddScalableImmediate(int64_t) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
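An illustrative query of isLegalAddressingMode, assuming an address of the form BaseReg + 4*IndexReg + 16; supportsScaledAddressing and the constants are hypothetical.

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static bool supportsScaledAddressing(const TargetLoweringBase *TLI,
                                     const DataLayout &DL, Type *AccessTy,
                                     unsigned AddrSpace) {
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = nullptr;  // no global symbol as the base
  AM.BaseOffs = 16;     // constant displacement of 16 bytes
  AM.HasBaseReg = true; // a base register is present
  AM.Scale = 4;         // an index register scaled by 4
  return TLI->isLegalAddressingMode(DL, AM, AccessTy, AddrSpace);
}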
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) const
bool isLoweredToCall(const Function *F) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
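Sketch of how an IR-level transform consumes these cost entry points through TargetTransformInfo, here comparing a multiply against a shift-plus-add sequence before rewriting; mulCheaperThanShiftAdd and the opcode choice are illustrative only.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static bool mulCheaperThanShiftAdd(const TargetTransformInfo &TTI, Type *Ty) {
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost MulCost =
      TTI.getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
  InstructionCost ShiftAddCost =
      TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) +
      TTI.getArithmeticInstrCost(Instruction::Add, Ty, CostKind);
  return MulCost <= ShiftAddCost;
}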
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
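A hedged example of querying the cost of two of these shuffle kinds. The exact parameter list of getShuffleCost has changed across LLVM versions, so this relies only on the common defaulted form (empty mask, reciprocal-throughput cost kind).

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static bool broadcastNoWorseThanReverse(const TargetTransformInfo &TTI,
                                        FixedVectorType *VecTy) {
  InstructionCost Broadcast =
      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
  InstructionCost Reverse =
      TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VecTy);
  // Broadcasting lane 0 is usually no more expensive than a full reversal.
  return Broadcast <= Reverse;
}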
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:373
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1651
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:558
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:230
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:258
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:212
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
Value * getOperand(unsigned i) const
Definition: User.h:169
static bool isVPBinOp(Intrinsic::ID ID)
static std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
static std::optional< Intrinsic::ID > getFunctionalIntrinsicIDForVP(Intrinsic::ID ID)
static bool isVPIntrinsic(Intrinsic::ID)
static bool isVPReduction(Intrinsic::ID ID)
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:664
Type * getElementType() const
Definition: DerivedTypes.h:436
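Halving a vector type, as the legalization-aware parts of a cost model do for types the target must split; halveVector is a hypothetical wrapper around the VectorType helpers above.

#include "llvm/IR/DerivedTypes.h"
#include <cassert>
using namespace llvm;

static VectorType *halveVector(VectorType *VTy) {
  // Works for fixed and scalable vectors alike: only the (known-minimum)
  // element count is halved, the element type is preserved.
  assert(VTy->getElementCount().getKnownMinValue() % 2 == 0 &&
         "cannot halve an odd element count");
  return VectorType::getHalfElementsVectorType(VTy);
}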
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ SMULFIX
RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on 2 integers with the same...
Definition: ISDOpcodes.h:374
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1120
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1124
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:514
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:906
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1028
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:765
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1047
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:905
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that behave the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1052
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1552
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:988
AddressSpace
Definition: NVPTXBaseInfo.h:21
unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:959
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
ConstantRange getVScaleRange(const Function *F, unsigned BitWidth)
Determine the possible constant range of vscale with the given bit width, based on the vscale_range f...
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
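The math helpers above in one place: a hypothetical computation of the number of vector iterations for a trip count and a power-of-two vectorization factor.

#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

static unsigned numVectorIterations(unsigned TripCount, unsigned VF) {
  assert(isPowerOf2_32(VF) && "vectorization factors are powers of two here");
  unsigned ShiftAmt = Log2_32(VF); // exact log2 for a power of two
  (void)ShiftAmt;
  return divideCeil(TripCount, VF); // ceil(TripCount / VF)
}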
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:341
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:275
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:307
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling based on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
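A hedged sketch of how a target's getUnrollingPreferences / getPeelingPreferences overrides might populate the fields documented above; every threshold below is invented for illustration.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

static void tunePreferences(TargetTransformInfo::UnrollingPreferences &UP,
                            TargetTransformInfo::PeelingPreferences &PP) {
  UP.Partial = true;              // allow partial unrolling
  UP.Runtime = true;              // allow runtime unrolling
  UP.PartialThreshold = 200;      // size budget for partial/runtime unrolling
  UP.OptSizeThreshold = 0;        // do not unroll when optimizing for size
  UP.PartialOptSizeThreshold = 0; // ...nor partially unroll for size
  UP.UpperBound = true;           // the trip-count upper bound may be used

  PP.AllowPeeling = true;           // peel iterations to simplify the loop
  PP.AllowLoopNestsPeeling = false; // but not across whole loop nests
  PP.PeelProfiledIterations = true; // peeling may be driven by profile data
  PP.PeelCount = 0;                 // no forced peel count
}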