1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
33#include "llvm/IR/BasicBlock.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DataLayout.h"
38#include "llvm/IR/InstrTypes.h"
39#include "llvm/IR/Instruction.h"
41#include "llvm/IR/Intrinsics.h"
42#include "llvm/IR/Operator.h"
43#include "llvm/IR/Type.h"
44#include "llvm/IR/Value.h"
51#include <algorithm>
52#include <cassert>
53#include <cstdint>
54#include <limits>
55#include <optional>
56#include <utility>
57
58namespace llvm {
59
60class Function;
61class GlobalValue;
62class LLVMContext;
63class ScalarEvolution;
64class SCEV;
65class TargetMachine;
66
67extern cl::opt<unsigned> PartialUnrollingThreshold;
68
69/// Base class which can be used to help build a TTI implementation.
70///
71/// This class provides as much implementation of the TTI interface as is
72/// possible using the target independent parts of the code generator.
73///
74/// In order to subclass it, your class must implement a getST() method to
75/// return the subtarget, and a getTLI() method to return the target lowering.
76/// We need these methods implemented in the derived class so that this class
77/// doesn't have to duplicate storage for them.
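///
/// For illustration only (the MyTarget names below are hypothetical, not part
/// of any in-tree target), a concrete implementation built on this helper
/// usually looks roughly like:
/// \code
///   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
///     using BaseT = BasicTTIImplBase<MyTargetTTIImpl>;
///     friend BaseT;
///
///     const MyTargetSubtarget *ST;
///     const MyTargetLowering *TLI;
///
///     const MyTargetSubtarget *getST() const { return ST; }
///     const MyTargetLowering *getTLI() const { return TLI; }
///   };
/// \endcode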
78template <typename T>
79class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
80private:
81 using BaseT = TargetTransformInfoImplCRTPBase<T>;
82 using TTI = TargetTransformInfo;
83
84 /// Helper function to access this as a T.
85 T *thisT() { return static_cast<T *>(this); }
86
87 /// Estimate a cost of Broadcast as an extract and sequence of insert
88 /// operations.
89 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
90 TTI::TargetCostKind CostKind) {
91 InstructionCost Cost = 0;
92 // Broadcast cost is equal to the cost of extracting the zero'th element
93 // plus the cost of inserting it into every element of the result vector.
94 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
95 CostKind, 0, nullptr, nullptr);
96
97 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
98 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
99 CostKind, i, nullptr, nullptr);
100 }
101 return Cost;
102 }
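 // Illustrative sketch (assumed IR, not taken from this file): for a
 // <4 x float> broadcast the model above corresponds to
 //   %e  = extractelement <4 x float> %src, i32 0
 //   %v0 = insertelement <4 x float> poison, float %e, i32 0
 //   ... repeated for lanes 1..3 ...
 // i.e. one extract plus four inserts, each priced via thisT()->getVectorInstrCost.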
103
104 /// Estimate a cost of shuffle as a sequence of extract and insert
105 /// operations.
106 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
107 TTI::TargetCostKind CostKind) {
108 InstructionCost Cost = 0;
109 // Shuffle cost is equal to the cost of extracting element from its argument
110 // plus the cost of inserting them onto the result vector.
111
112 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
113 // index 0 of first vector, index 1 of second vector, index 2 of first
114 // vector and finally index 3 of second vector and insert them at index
115 // <0,1,2,3> of result vector.
116 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
117 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
118 CostKind, i, nullptr, nullptr);
119 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
120 CostKind, i, nullptr, nullptr);
121 }
122 return Cost;
123 }
124
125 /// Estimate a cost of subvector extraction as a sequence of extract and
126 /// insert operations.
127 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
128 TTI::TargetCostKind CostKind,
129 int Index,
130 FixedVectorType *SubVTy) {
131 assert(VTy && SubVTy &&
132 "Can only extract subvectors from vectors");
133 int NumSubElts = SubVTy->getNumElements();
134 assert((!isa<FixedVectorType>(VTy) ||
135 (Index + NumSubElts) <=
136 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
137 "SK_ExtractSubvector index out of range");
138
140 // Subvector extraction cost is equal to the cost of extracting element from
141 // the source type plus the cost of inserting them into the result vector
142 // type.
143 for (int i = 0; i != NumSubElts; ++i) {
144 Cost +=
145 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
146 CostKind, i + Index, nullptr, nullptr);
147 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
148 CostKind, i, nullptr, nullptr);
149 }
150 return Cost;
151 }
152
153 /// Estimate a cost of subvector insertion as a sequence of extract and
154 /// insert operations.
155 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
156 TTI::TargetCostKind CostKind,
157 int Index,
158 FixedVectorType *SubVTy) {
159 assert(VTy && SubVTy &&
160 "Can only insert subvectors into vectors");
161 int NumSubElts = SubVTy->getNumElements();
162 assert((!isa<FixedVectorType>(VTy) ||
163 (Index + NumSubElts) <=
164 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
165 "SK_InsertSubvector index out of range");
166
168 // Subvector insertion cost is equal to the cost of extracting element from
169 // the source type plus the cost of inserting them into the result vector
170 // type.
171 for (int i = 0; i != NumSubElts; ++i) {
172 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
173 CostKind, i, nullptr, nullptr);
174 Cost +=
175 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
176 i + Index, nullptr, nullptr);
177 }
178 return Cost;
179 }
180
181 /// Local query method delegates up to T which *must* implement this!
182 const TargetSubtargetInfo *getST() const {
183 return static_cast<const T *>(this)->getST();
184 }
185
186 /// Local query method delegates up to T which *must* implement this!
187 const TargetLoweringBase *getTLI() const {
188 return static_cast<const T *>(this)->getTLI();
189 }
190
191 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
192 switch (M) {
193 case TTI::MIM_Unindexed:
194 return ISD::UNINDEXED;
195 case TTI::MIM_PreInc:
196 return ISD::PRE_INC;
197 case TTI::MIM_PreDec:
198 return ISD::PRE_DEC;
199 case TTI::MIM_PostInc:
200 return ISD::POST_INC;
201 case TTI::MIM_PostDec:
202 return ISD::POST_DEC;
203 }
204 llvm_unreachable("Unexpected MemIndexedMode");
205 }
206
207 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
208 Align Alignment,
209 bool VariableMask,
210 bool IsGatherScatter,
211 TTI::TargetCostKind CostKind) {
212 // We cannot scalarize scalable vectors, so return Invalid.
213 if (isa<ScalableVectorType>(DataTy))
214 return InstructionCost::getInvalid();
215
216 auto *VT = cast<FixedVectorType>(DataTy);
217 // Assume the target does not have support for gather/scatter operations
218 // and provide a rough estimate.
219 //
220 // First, compute the cost of the individual memory operations.
221 InstructionCost AddrExtractCost =
222 IsGatherScatter
223 ? getVectorInstrCost(Instruction::ExtractElement,
224 FixedVectorType::get(
225 PointerType::get(VT->getElementType(), 0),
226 VT->getNumElements()),
227 CostKind, -1, nullptr, nullptr)
228 : 0;
229 InstructionCost LoadCost =
230 VT->getNumElements() *
231 (AddrExtractCost +
232 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
233
234 // Next, compute the cost of packing the result in a vector.
235 InstructionCost PackingCost =
236 getScalarizationOverhead(VT, Opcode != Instruction::Store,
237 Opcode == Instruction::Store, CostKind);
238
239 InstructionCost ConditionalCost = 0;
240 if (VariableMask) {
241 // Compute the cost of conditionally executing the memory operations with
242 // variable masks. This includes extracting the individual conditions,
243 // branches, and PHIs to combine the results.
244 // NOTE: Estimating the cost of conditionally executing the memory
245 // operations accurately is quite difficult and the current solution
246 // provides a very rough estimate only.
247 ConditionalCost =
248 VT->getNumElements() *
249 (getVectorInstrCost(
250 Instruction::ExtractElement,
251 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
252 VT->getNumElements()),
253 CostKind, -1, nullptr, nullptr) +
254 getCFInstrCost(Instruction::Br, CostKind) +
255 getCFInstrCost(Instruction::PHI, CostKind));
256 }
257
258 return LoadCost + PackingCost + ConditionalCost;
259 }
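 // A rough worked example (unit costs are illustrative assumptions, not target
 // data): a masked gather of <4 x i32> with a variable mask decomposes into
 //   4 * (address extract + scalar load)        // LoadCost
 // + 4 inserts to repack the results            // PackingCost
 // + 4 * (condition extract + branch + PHI)     // ConditionalCost
 // which with unit costs everywhere gives 4*2 + 4 + 4*3 = 24.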
260
261protected:
262 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
263 : BaseT(DL) {}
264 virtual ~BasicTTIImplBase() = default;
265
266 using TargetTransformInfoImplBase::DL;
267
268public:
269 /// \name Scalar TTI Implementations
270 /// @{
271 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
272 unsigned AddressSpace, Align Alignment,
273 unsigned *Fast) const {
274 EVT E = EVT::getIntegerVT(Context, BitWidth);
275 return getTLI()->allowsMisalignedMemoryAccesses(
276 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
277 }
278
279 bool hasBranchDivergence() { return false; }
280
281 bool isSourceOfDivergence(const Value *V) { return false; }
282
283 bool isAlwaysUniform(const Value *V) { return false; }
284
285 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
286 return false;
287 }
288
289 unsigned getFlatAddressSpace() {
290 // Return an invalid address space.
291 return -1;
292 }
293
294 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
295 Intrinsic::ID IID) const {
296 return false;
297 }
298
299 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
300 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
301 }
302
303 unsigned getAssumedAddrSpace(const Value *V) const {
304 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
305 }
306
307 bool isSingleThreaded() const {
308 return getTLI()->getTargetMachine().Options.ThreadModel ==
310 }
311
312 std::pair<const Value *, unsigned>
313 getPredicatedAddrSpace(const Value *V) const {
314 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
315 }
316
317 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
318 Value *NewV) const {
319 return nullptr;
320 }
321
322 bool isLegalAddImmediate(int64_t imm) {
323 return getTLI()->isLegalAddImmediate(imm);
324 }
325
326 bool isLegalICmpImmediate(int64_t imm) {
327 return getTLI()->isLegalICmpImmediate(imm);
328 }
329
330 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
331 bool HasBaseReg, int64_t Scale,
332 unsigned AddrSpace, Instruction *I = nullptr) {
333 TargetLoweringBase::AddrMode AM;
334 AM.BaseGV = BaseGV;
335 AM.BaseOffs = BaseOffset;
336 AM.HasBaseReg = HasBaseReg;
337 AM.Scale = Scale;
338 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
339 }
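 // For illustration (an assumed query, not taken from this header): asking
 // about an access of the form GV + BaseReg + 4*IndexReg + 16 amounts to
 //   TargetLoweringBase::AddrMode AM;
 //   AM.BaseGV = GV; AM.BaseOffs = 16; AM.HasBaseReg = true; AM.Scale = 4;
 // and letting the target lowering decide whether that combination is legal
 // in the given address space.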
340
341 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
342 Type *ScalarValTy) const {
343 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
344 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
345 EVT VT = getTLI()->getValueType(DL, SrcTy);
346 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
347 getTLI()->isOperationCustom(ISD::STORE, VT))
348 return true;
349
350 EVT ValVT =
351 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
352 EVT LegalizedVT =
353 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
354 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
355 };
356 while (VF > 2 && IsSupportedByTarget(VF))
357 VF /= 2;
358 return VF;
359 }
360
361 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
362 const DataLayout &DL) const {
363 EVT VT = getTLI()->getValueType(DL, Ty);
364 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
365 }
366
367 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
368 const DataLayout &DL) const {
369 EVT VT = getTLI()->getValueType(DL, Ty);
370 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
371 }
372
375 }
376
379 }
380
383 }
384
385 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
386 int64_t BaseOffset, bool HasBaseReg,
387 int64_t Scale, unsigned AddrSpace) {
388 TargetLoweringBase::AddrMode AM;
389 AM.BaseGV = BaseGV;
390 AM.BaseOffs = BaseOffset;
391 AM.HasBaseReg = HasBaseReg;
392 AM.Scale = Scale;
393 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
394 return 0;
395 return -1;
396 }
397
398 bool isTruncateFree(Type *Ty1, Type *Ty2) {
399 return getTLI()->isTruncateFree(Ty1, Ty2);
400 }
401
402 bool isProfitableToHoist(Instruction *I) {
403 return getTLI()->isProfitableToHoist(I);
404 }
405
406 bool useAA() const { return getST()->useAA(); }
407
408 bool isTypeLegal(Type *Ty) {
409 EVT VT = getTLI()->getValueType(DL, Ty);
410 return getTLI()->isTypeLegal(VT);
411 }
412
413 unsigned getRegUsageForType(Type *Ty) {
414 EVT ETy = getTLI()->getValueType(DL, Ty);
415 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
416 }
417
421 return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
422 }
423
424 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
425 unsigned &JumpTableSize,
426 ProfileSummaryInfo *PSI,
427 BlockFrequencyInfo *BFI) {
428 /// Try to find the estimated number of clusters. Note that the number of
429 /// clusters identified in this function could be different from the actual
430 /// numbers found in lowering. This function ignores switches that are
431 /// lowered with a mix of jump table / bit test / BTree. This function was
432 /// initially intended to be used when estimating the cost of switch in
433 /// inline cost heuristic, but it's a generic cost model to be used in other
434 /// places (e.g., in loop unrolling).
435 unsigned N = SI.getNumCases();
436 const TargetLoweringBase *TLI = getTLI();
437 const DataLayout &DL = this->getDataLayout();
438
439 JumpTableSize = 0;
440 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
441
442 // Early exit if both a jump table and bit test are not allowed.
443 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
444 return N;
445
446 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
447 APInt MinCaseVal = MaxCaseVal;
448 for (auto CI : SI.cases()) {
449 const APInt &CaseVal = CI.getCaseValue()->getValue();
450 if (CaseVal.sgt(MaxCaseVal))
451 MaxCaseVal = CaseVal;
452 if (CaseVal.slt(MinCaseVal))
453 MinCaseVal = CaseVal;
454 }
455
456 // Check if suitable for a bit test
457 if (N <= DL.getIndexSizeInBits(0u)) {
458 SmallPtrSet<const BasicBlock *, 4> Dests;
459 for (auto I : SI.cases())
460 Dests.insert(I.getCaseSuccessor());
461
462 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
463 DL))
464 return 1;
465 }
466
467 // Check if suitable for a jump table.
468 if (IsJTAllowed) {
469 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
470 return N;
471 uint64_t Range =
472 (MaxCaseVal - MinCaseVal)
473 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
474 // Check whether a range of clusters is dense enough for a jump table
475 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
476 JumpTableSize = Range;
477 return 1;
478 }
479 }
480 return N;
481 }
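 // Illustrative behaviour (assuming a target that allows jump tables with a
 // minimum of 4 entries): a switch with dense cases {0,1,2,3,4} and five
 // distinct successors is not bit-test friendly, but its range (5) is dense,
 // so isSuitableForJumpTable typically succeeds, JumpTableSize becomes 5 and a
 // single cluster is reported; five widely scattered cases would instead be
 // reported as N = 5 clusters.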
482
483 bool shouldBuildLookupTables() {
484 const TargetLoweringBase *TLI = getTLI();
485 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
486 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
487 }
488
489 bool shouldBuildRelLookupTables() const {
490 const TargetMachine &TM = getTLI()->getTargetMachine();
491 // If non-PIC mode, do not generate a relative lookup table.
492 if (!TM.isPositionIndependent())
493 return false;
494
495 /// Relative lookup table entries consist of 32-bit offsets.
496 /// Do not generate relative lookup tables for large code models
497 /// in 64-bit architectures where 32-bit offsets might not be enough.
498 if (TM.getCodeModel() == CodeModel::Medium ||
499 TM.getCodeModel() == CodeModel::Large)
500 return false;
501
502 Triple TargetTriple = TM.getTargetTriple();
503 if (!TargetTriple.isArch64Bit())
504 return false;
505
506 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
507 // there.
508 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
509 return false;
510
511 return true;
512 }
513
514 bool haveFastSqrt(Type *Ty) {
515 const TargetLoweringBase *TLI = getTLI();
516 EVT VT = TLI->getValueType(DL, Ty);
517 return TLI->isTypeLegal(VT) &&
518 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
519 }
520
521 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
522 return true;
523 }
524
525 InstructionCost getFPOpCost(Type *Ty) {
526 // Check whether FADD is available, as a proxy for floating-point in
527 // general.
528 const TargetLoweringBase *TLI = getTLI();
529 EVT VT = TLI->getValueType(DL, Ty);
530 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
531 return TargetTransformInfo::TCC_Basic;
532 return TargetTransformInfo::TCC_Expensive;
533 }
534
535 unsigned getInliningThresholdMultiplier() { return 1; }
536 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
537
538 int getInlinerVectorBonusPercent() { return 150; }
539
543 // This unrolling functionality is target independent, but to provide some
544 // motivation for its intended use, for x86:
545
546 // According to the Intel 64 and IA-32 Architectures Optimization Reference
547 // Manual, Intel Core models and later have a loop stream detector (and
548 // associated uop queue) that can benefit from partial unrolling.
549 // The relevant requirements are:
550 // - The loop must have no more than 4 (8 for Nehalem and later) branches
551 // taken, and none of them may be calls.
552 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
553
554 // According to the Software Optimization Guide for AMD Family 15h
555 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
556 // and loop buffer which can benefit from partial unrolling.
557 // The relevant requirements are:
558 // - The loop must have fewer than 16 branches
559 // - The loop must have less than 40 uops in all executed loop branches
560
561 // The number of taken branches in a loop is hard to estimate here, and
562 // benchmarking has revealed that it is better not to be conservative when
563 // estimating the branch count. As a result, we'll ignore the branch limits
564 // until someone finds a case where it matters in practice.
565
566 unsigned MaxOps;
567 const TargetSubtargetInfo *ST = getST();
568 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
569 MaxOps = PartialUnrollingThreshold;
570 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
571 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
572 else
573 return;
574
575 // Scan the loop: don't unroll loops with calls.
576 for (BasicBlock *BB : L->blocks()) {
577 for (Instruction &I : *BB) {
578 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
579 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
580 if (!thisT()->isLoweredToCall(F))
581 continue;
582 }
583
584 if (ORE) {
585 ORE->emit([&]() {
586 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
587 L->getHeader())
588 << "advising against unrolling the loop because it "
589 "contains a "
590 << ore::NV("Call", &I);
591 });
592 }
593 return;
594 }
595 }
596 }
597
598 // Enable runtime and partial unrolling up to the specified size.
599 // Enable using trip count upper bound to unroll loops.
600 UP.Partial = UP.Runtime = UP.UpperBound = true;
601 UP.PartialThreshold = MaxOps;
602
603 // Avoid unrolling when optimizing for size.
604 UP.OptSizeThreshold = 0;
605 UP.PartialOptSizeThreshold = 0;
606
607 // Set number of instructions optimized when "back edge"
608 // becomes "fall through" to default value of 2.
609 UP.BEInsns = 2;
610 }
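 // Illustrative outcome (assuming a call-free loop on a subtarget whose
 // scheduling model reports LoopMicroOpBufferSize = 28 and no
 // -partial-unrolling-threshold override): MaxOps becomes 28, so partial,
 // runtime and upper-bound unrolling are enabled with UP.PartialThreshold = 28
 // and UP.BEInsns = 2.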
611
612 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
613 TTI::PeelingPreferences &PP) {
614 PP.PeelCount = 0;
615 PP.AllowPeeling = true;
616 PP.AllowLoopNestsPeeling = false;
617 PP.PeelProfiledIterations = true;
618 }
619
620 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
621 AssumptionCache &AC,
622 TargetLibraryInfo *LibInfo,
623 HardwareLoopInfo &HWLoopInfo) {
624 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
625 }
626
629 }
630
631 TailFoldingStyle
632 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
633 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
634 }
635
636 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
637 IntrinsicInst &II) {
638 return BaseT::instCombineIntrinsic(IC, II);
639 }
640
641 std::optional<Value *>
642 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
643 APInt DemandedMask, KnownBits &Known,
644 bool &KnownBitsComputed) {
645 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
646 KnownBitsComputed);
647 }
648
649 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
650 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
651 APInt &UndefElts2, APInt &UndefElts3,
652 std::function<void(Instruction *, unsigned, APInt, APInt &)>
653 SimplifyAndSetOp) {
654 return BaseT::simplifyDemandedVectorEltsIntrinsic(
655 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
656 SimplifyAndSetOp);
657 }
658
659 virtual std::optional<unsigned>
660 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
661 return std::optional<unsigned>(
662 getST()->getCacheSize(static_cast<unsigned>(Level)));
663 }
664
665 virtual std::optional<unsigned>
666 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
667 std::optional<unsigned> TargetResult =
668 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
669
670 if (TargetResult)
671 return TargetResult;
672
673 return BaseT::getCacheAssociativity(Level);
674 }
675
676 virtual unsigned getCacheLineSize() const {
677 return getST()->getCacheLineSize();
678 }
679
680 virtual unsigned getPrefetchDistance() const {
681 return getST()->getPrefetchDistance();
682 }
683
684 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
685 unsigned NumStridedMemAccesses,
686 unsigned NumPrefetches,
687 bool HasCall) const {
688 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
689 NumPrefetches, HasCall);
690 }
691
692 virtual unsigned getMaxPrefetchIterationsAhead() const {
693 return getST()->getMaxPrefetchIterationsAhead();
694 }
695
696 virtual bool enableWritePrefetching() const {
697 return getST()->enableWritePrefetching();
698 }
699
700 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
701 return getST()->shouldPrefetchAddressSpace(AS);
702 }
703
704 /// @}
705
706 /// \name Vector TTI Implementations
707 /// @{
708
709 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
710 return TypeSize::getFixed(32);
711 }
712
713 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
714 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
715 bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
716
717 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
718 /// are set if the demanded result elements need to be inserted and/or
719 /// extracted from vectors.
720 InstructionCost getScalarizationOverhead(VectorType *InTy,
721 const APInt &DemandedElts,
722 bool Insert, bool Extract,
723 TTI::TargetCostKind CostKind) {
724 /// FIXME: a bitfield is not a reasonable abstraction for talking about
725 /// which elements are needed from a scalable vector
726 if (isa<ScalableVectorType>(InTy))
727 return InstructionCost::getInvalid();
728 auto *Ty = cast<FixedVectorType>(InTy);
729
730 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
731 "Vector size mismatch");
732
733 InstructionCost Cost = 0;
734
735 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
736 if (!DemandedElts[i])
737 continue;
738 if (Insert)
739 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
740 CostKind, i, nullptr, nullptr);
741 if (Extract)
742 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
743 CostKind, i, nullptr, nullptr);
744 }
745
746 return Cost;
747 }
748
749 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
750 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
751 bool Extract,
752 TTI::TargetCostKind CostKind) {
753 if (isa<ScalableVectorType>(InTy))
754 return InstructionCost::getInvalid();
755 auto *Ty = cast<FixedVectorType>(InTy);
756
757 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
758 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
759 CostKind);
760 }
761
762 /// Estimate the overhead of scalarizing an instruction's unique
763 /// non-constant operands. The (potentially vector) types to use for each
764 /// argument are passed via Tys.
765 InstructionCost
766 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
767 ArrayRef<Type *> Tys,
768 TTI::TargetCostKind CostKind) {
769 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
770
771 InstructionCost Cost = 0;
772 SmallPtrSet<const Value*, 4> UniqueOperands;
773 for (int I = 0, E = Args.size(); I != E; I++) {
774 // Disregard things like metadata arguments.
775 const Value *A = Args[I];
776 Type *Ty = Tys[I];
777 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
778 !Ty->isPtrOrPtrVectorTy())
779 continue;
780
781 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
782 if (auto *VecTy = dyn_cast<VectorType>(Ty))
783 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
784 /*Extract*/ true, CostKind);
785 }
786 }
787
788 return Cost;
789 }
790
791 /// Estimate the overhead of scalarizing the inputs and outputs of an
792 /// instruction, with return type RetTy and arguments Args of type Tys. If
793 /// Args are unknown (empty), then the cost associated with one argument is
794 /// added as a heuristic.
795 InstructionCost getScalarizationOverhead(VectorType *RetTy,
796 ArrayRef<const Value *> Args,
797 ArrayRef<Type *> Tys,
798 TTI::TargetCostKind CostKind) {
799 InstructionCost Cost = getScalarizationOverhead(
800 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
801 if (!Args.empty())
802 Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
803 else
804 // When no information on arguments is provided, we add the cost
805 // associated with one argument as a heuristic.
806 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
807 /*Extract*/ true, CostKind);
808
809 return Cost;
810 }
811
812 /// Estimate the cost of type-legalization and the legalized type.
813 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
814 LLVMContext &C = Ty->getContext();
815 EVT MTy = getTLI()->getValueType(DL, Ty);
816
817 InstructionCost Cost = 1;
818 // We keep legalizing the type until we find a legal kind. We assume that
819 // the only operation that costs anything is the split. After splitting
820 // we need to handle two types.
821 while (true) {
822 TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);
823
824 if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
825 // Ensure we return a sensible simple VT here, since many callers of
826 // this function require it.
827 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
828 return std::make_pair(InstructionCost::getInvalid(), VT);
829 }
830
831 if (LK.first == TargetLoweringBase::TypeLegal)
832 return std::make_pair(Cost, MTy.getSimpleVT());
833
834 if (LK.first == TargetLoweringBase::TypeSplitVector ||
835 LK.first == TargetLoweringBase::TypeExpandInteger)
836 Cost *= 2;
837
838 // Do not loop with f128 type.
839 if (MTy == LK.second)
840 return std::make_pair(Cost, MTy.getSimpleVT());
841
842 // Keep legalizing the type.
843 MTy = LK.second;
844 }
845 }
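 // Worked example (assuming a target whose widest legal vector type is
 // v4i32): legalizing <16 x i32> splits twice, 16 -> 2 x 8 -> 2 x (2 x 4),
 // doubling the cost at each step, so the function returns {4, MVT::v4i32}.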
846
847 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
848
850 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
853 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
854 const Instruction *CxtI = nullptr) {
855 // Check if any of the operands are vector operands.
856 const TargetLoweringBase *TLI = getTLI();
857 int ISD = TLI->InstructionOpcodeToISD(Opcode);
858 assert(ISD && "Invalid opcode");
859
860 // TODO: Handle more cost kinds.
861 if (CostKind != TTI::TCK_RecipThroughput)
862 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
863 Opd1Info, Opd2Info,
864 Args, CxtI);
865
866 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
867
868 bool IsFloat = Ty->isFPOrFPVectorTy();
869 // Assume that floating point arithmetic operations cost twice as much as
870 // integer operations.
871 InstructionCost OpCost = (IsFloat ? 2 : 1);
872
873 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
874 // The operation is legal. Assume it costs 1.
875 // TODO: Once we have extract/insert subvector cost we need to use them.
876 return LT.first * OpCost;
877 }
878
879 if (!TLI->isOperationExpand(ISD, LT.second)) {
880 // If the operation is custom lowered, then assume that the code is twice
881 // as expensive.
882 return LT.first * 2 * OpCost;
883 }
884
885 // An 'Expand' of URem and SRem is special because it may default
886 // to expanding the operation into a sequence of sub-operations
887 // i.e. X % Y -> X-(X/Y)*Y.
888 if (ISD == ISD::UREM || ISD == ISD::SREM) {
889 bool IsSigned = ISD == ISD::SREM;
890 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
891 LT.second) ||
892 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
893 LT.second)) {
894 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
895 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
896 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
897 InstructionCost MulCost =
898 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
899 InstructionCost SubCost =
900 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
901 return DivCost + MulCost + SubCost;
902 }
903 }
904
905 // We cannot scalarize scalable vectors, so return Invalid.
906 if (isa<ScalableVectorType>(Ty))
907 return InstructionCost::getInvalid();
908
909 // Else, assume that we need to scalarize this op.
910 // TODO: If one of the types get legalized by splitting, handle this
911 // similarly to what getCastInstrCost() does.
912 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
913 InstructionCost Cost = thisT()->getArithmeticInstrCost(
914 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
915 Args, CxtI);
916 // Return the cost of multiple scalar invocation plus the cost of
917 // inserting and extracting the values.
918 SmallVector<Type *> Tys(Args.size(), Ty);
919 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
920 VTy->getNumElements() * Cost;
921 }
922
923 // We don't know anything about this scalar instruction.
924 return OpCost;
925 }
926
928 ArrayRef<int> Mask) const {
929 int Limit = Mask.size() * 2;
930 if (Mask.empty() ||
931 // Extra check required by isSingleSourceMaskImpl function (called by
932 // ShuffleVectorInst::isSingleSourceMask).
933 any_of(Mask, [Limit](int I) { return I >= Limit; }))
934 return Kind;
935 int Index;
936 switch (Kind) {
939 return TTI::SK_Reverse;
941 return TTI::SK_Broadcast;
942 break;
945 return TTI::SK_Select;
947 return TTI::SK_Transpose;
949 return TTI::SK_Splice;
950 break;
951 case TTI::SK_Select:
952 case TTI::SK_Reverse:
957 case TTI::SK_Splice:
958 break;
959 }
960 return Kind;
961 }
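 // For example (an assumed mask, not taken from this file): a single-source
 // shuffle of a 4-element vector with mask <3,2,1,0> is recognised as a
 // reverse mask, so SK_PermuteSingleSrc is refined to SK_Reverse and targets
 // that special-case reverses in getShuffleCost can price it accordingly.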
962
964 ArrayRef<int> Mask,
966 VectorType *SubTp,
967 ArrayRef<const Value *> Args = std::nullopt) {
968
969 switch (improveShuffleKindFromMask(Kind, Mask)) {
971 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
972 return getBroadcastShuffleOverhead(FVT, CostKind);
974 case TTI::SK_Select:
975 case TTI::SK_Splice:
976 case TTI::SK_Reverse:
980 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
981 return getPermuteShuffleOverhead(FVT, CostKind);
984 return getExtractSubvectorOverhead(Tp, CostKind, Index,
985 cast<FixedVectorType>(SubTp));
987 return getInsertSubvectorOverhead(Tp, CostKind, Index,
988 cast<FixedVectorType>(SubTp));
989 }
990 llvm_unreachable("Unknown TTI::ShuffleKind");
991 }
992
993 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
996 const Instruction *I = nullptr) {
997 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
998 return 0;
999
1000 const TargetLoweringBase *TLI = getTLI();
1001 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1002 assert(ISD && "Invalid opcode");
1003 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1004 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1005
1006 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1007 TypeSize DstSize = DstLT.second.getSizeInBits();
1008 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1009 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1010
1011 switch (Opcode) {
1012 default:
1013 break;
1014 case Instruction::Trunc:
1015 // Check for NOOP conversions.
1016 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1017 return 0;
1018 [[fallthrough]];
1019 case Instruction::BitCast:
1020 // Bitcast between types that are legalized to the same type are free and
1021 // assume int to/from ptr of the same size is also free.
1022 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1023 SrcSize == DstSize)
1024 return 0;
1025 break;
1026 case Instruction::FPExt:
1027 if (I && getTLI()->isExtFree(I))
1028 return 0;
1029 break;
1030 case Instruction::ZExt:
1031 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1032 return 0;
1033 [[fallthrough]];
1034 case Instruction::SExt:
1035 if (I && getTLI()->isExtFree(I))
1036 return 0;
1037
1038 // If this is a zext/sext of a load, return 0 if the corresponding
1039 // extending load exists on target and the result type is legal.
1040 if (CCH == TTI::CastContextHint::Normal) {
1041 EVT ExtVT = EVT::getEVT(Dst);
1042 EVT LoadVT = EVT::getEVT(Src);
1043 unsigned LType =
1044 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1045 if (DstLT.first == SrcLT.first &&
1046 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1047 return 0;
1048 }
1049 break;
1050 case Instruction::AddrSpaceCast:
1051 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1052 Dst->getPointerAddressSpace()))
1053 return 0;
1054 break;
1055 }
1056
1057 auto *SrcVTy = dyn_cast<VectorType>(Src);
1058 auto *DstVTy = dyn_cast<VectorType>(Dst);
1059
1060 // If the cast is marked as legal (or promote) then assume low cost.
1061 if (SrcLT.first == DstLT.first &&
1062 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1063 return SrcLT.first;
1064
1065 // Handle scalar conversions.
1066 if (!SrcVTy && !DstVTy) {
1067 // Just check the op cost. If the operation is legal then assume it costs
1068 // 1.
1069 if (!TLI->isOperationExpand(ISD, DstLT.second))
1070 return 1;
1071
1072 // Assume that illegal scalar instruction are expensive.
1073 return 4;
1074 }
1075
1076 // Check vector-to-vector casts.
1077 if (DstVTy && SrcVTy) {
1078 // If the cast is between same-sized registers, then the check is simple.
1079 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1080
1081 // Assume that Zext is done using AND.
1082 if (Opcode == Instruction::ZExt)
1083 return SrcLT.first;
1084
1085 // Assume that sext is done using SHL and SRA.
1086 if (Opcode == Instruction::SExt)
1087 return SrcLT.first * 2;
1088
1089 // Just check the op cost. If the operation is legal then assume it
1090 // costs
1091 // 1 and multiply by the type-legalization overhead.
1092 if (!TLI->isOperationExpand(ISD, DstLT.second))
1093 return SrcLT.first * 1;
1094 }
1095
1096 // If we are legalizing by splitting, query the concrete TTI for the cost
1097 // of casting the original vector twice. We also need to factor in the
1098 // cost of the split itself. Count that as 1, to be consistent with
1099 // getTypeLegalizationCost().
1100 bool SplitSrc =
1101 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1103 bool SplitDst =
1104 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1106 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1107 DstVTy->getElementCount().isVector()) {
1108 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1109 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1110 T *TTI = static_cast<T *>(this);
1111 // If both types need to be split then the split is free.
1112 InstructionCost SplitCost =
1113 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1114 return SplitCost +
1115 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1116 CostKind, I));
1117 }
1118
1119 // Scalarization cost is Invalid, can't assume any num elements.
1120 if (isa<ScalableVectorType>(DstVTy))
1122
1123 // In other cases where the source or destination are illegal, assume
1124 // the operation will get scalarized.
1125 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1126 InstructionCost Cost = thisT()->getCastInstrCost(
1127 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1128
1129 // Return the cost of multiple scalar invocation plus the cost of
1130 // inserting and extracting the values.
1131 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1132 CostKind) +
1133 Num * Cost;
1134 }
1135
1136 // We already handled vector-to-vector and scalar-to-scalar conversions.
1137 // This
1138 // is where we handle bitcast between vectors and scalars. We need to assume
1139 // that the conversion is scalarized in one way or another.
1140 if (Opcode == Instruction::BitCast) {
1141 // Illegal bitcasts are done by storing and loading from a stack slot.
1142 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1143 /*Extract*/ true, CostKind)
1144 : 0) +
1145 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1146 /*Extract*/ false, CostKind)
1147 : 0);
1148 }
1149
1150 llvm_unreachable("Unhandled cast");
1151 }
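 // Two illustrative data points (assuming a typical integer target, not any
 // specific backend): a zext i8 -> i32 of a load in a Normal cast context is
 // free when the corresponding ISD::ZEXTLOAD is legal for the legalized types,
 // so 0 is returned; a cast between illegal fixed-width vectors falls through
 // to the scalarized path and costs Num scalar casts plus the insert/extract
 // overhead of rebuilding the destination vector.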
1152
1154 VectorType *VecTy, unsigned Index) {
1156 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1157 CostKind, Index, nullptr, nullptr) +
1158 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1160 }
1161
1163 const Instruction *I = nullptr) {
1164 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1165 }
1166
1167 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1168 CmpInst::Predicate VecPred,
1170 const Instruction *I = nullptr) {
1171 const TargetLoweringBase *TLI = getTLI();
1172 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1173 assert(ISD && "Invalid opcode");
1174
1175 // TODO: Handle other cost kinds.
1176 if (CostKind != TTI::TCK_RecipThroughput)
1177 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1178 I);
1179
1180 // Selects on vectors are actually vector selects.
1181 if (ISD == ISD::SELECT) {
1182 assert(CondTy && "CondTy must exist");
1183 if (CondTy->isVectorTy())
1184 ISD = ISD::VSELECT;
1185 }
1186 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1187
1188 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1189 !TLI->isOperationExpand(ISD, LT.second)) {
1190 // The operation is legal. Assume it costs 1. Multiply
1191 // by the type-legalization overhead.
1192 return LT.first * 1;
1193 }
1194
1195 // Otherwise, assume that the cast is scalarized.
1196 // TODO: If one of the types get legalized by splitting, handle this
1197 // similarly to what getCastInstrCost() does.
1198 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1199 if (isa<ScalableVectorType>(ValTy))
1200 return InstructionCost::getInvalid();
1201
1202 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1203 if (CondTy)
1204 CondTy = CondTy->getScalarType();
1205 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1206 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1207
1208 // Return the cost of multiple scalar invocation plus the cost of
1209 // inserting and extracting the values.
1210 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1211 /*Extract*/ false, CostKind) +
1212 Num * Cost;
1213 }
1214
1215 // Unknown scalar opcode.
1216 return 1;
1217 }
1218
1221 unsigned Index, Value *Op0, Value *Op1) {
1222 return getRegUsageForType(Val->getScalarType());
1223 }
1224
1227 unsigned Index) {
1228 Value *Op0 = nullptr;
1229 Value *Op1 = nullptr;
1230 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1231 Op0 = IE->getOperand(0);
1232 Op1 = IE->getOperand(1);
1233 }
1234 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1235 Op1);
1236 }
1237
1238 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1239 int VF,
1240 const APInt &DemandedDstElts,
1242 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1243 "Unexpected size of DemandedDstElts.");
1244
1246
1247 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1248 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1249
1250 // The Mask shuffling cost is extract all the elements of the Mask
1251 // and insert each of them Factor times into the wide vector:
1252 //
1253 // E.g. an interleaved group with factor 3:
1254 // %mask = icmp ult <8 x i32> %vec1, %vec2
1255 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1256 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1257 // The cost is estimated as extract all mask elements from the <8xi1> mask
1258 // vector and insert them factor times into the <24xi1> shuffled mask
1259 // vector.
1260 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1261 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1262 /*Insert*/ false,
1263 /*Extract*/ true, CostKind);
1264 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1265 /*Insert*/ true,
1266 /*Extract*/ false, CostKind);
1267
1268 return Cost;
1269 }
1270
1272 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1275 const Instruction *I = nullptr) {
1276 assert(!Src->isVoidTy() && "Invalid type");
1277 // Assume types, such as structs, are expensive.
1278 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1279 return 4;
1280 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1281
1282 // Assuming that all loads of legal types cost 1.
1283 InstructionCost Cost = LT.first;
1284 if (CostKind != TTI::TCK_RecipThroughput)
1285 return Cost;
1286
1287 const DataLayout &DL = this->getDataLayout();
1288 if (Src->isVectorTy() &&
1289 // In practice it's not currently possible to have a change in lane
1290 // length for extending loads or truncating stores so both types should
1291 // have the same scalable property.
1292 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1293 LT.second.getSizeInBits())) {
1294 // This is a vector load that legalizes to a larger type than the vector
1295 // itself. Unless the corresponding extending load or truncating store is
1296 // legal, then this will scalarize.
1297 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1298 EVT MemVT = getTLI()->getValueType(DL, Src);
1299 if (Opcode == Instruction::Store)
1300 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1301 else
1302 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1303
1304 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1305 // This is a vector load/store for some illegal type that is scalarized.
1306 // We must account for the cost of building or decomposing the vector.
1307 Cost += getScalarizationOverhead(
1308 cast<VectorType>(Src), Opcode != Instruction::Store,
1309 Opcode == Instruction::Store, CostKind);
1310 }
1311 }
1312
1313 return Cost;
1314 }
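 // For example (assuming a target with 128-bit vector registers): a load of
 // <8 x i32> legalizes by splitting into two legal loads, so LT.first == 2 and
 // the returned throughput cost is 2; the extra scalarization overhead above
 // is only added when the legalized type is wider than the source vector and
 // no extending load or truncating store covers the gap.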
1315
1317 Align Alignment, unsigned AddressSpace,
1319 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1320 CostKind);
1321 }
1322
1324 const Value *Ptr, bool VariableMask,
1325 Align Alignment,
1327 const Instruction *I = nullptr) {
1328 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1329 true, CostKind);
1330 }
1331
1333 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1334 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1335 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1336
1337 // We cannot scalarize scalable vectors, so return Invalid.
1338 if (isa<ScalableVectorType>(VecTy))
1339 return InstructionCost::getInvalid();
1340
1341 auto *VT = cast<FixedVectorType>(VecTy);
1342
1343 unsigned NumElts = VT->getNumElements();
1344 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1345
1346 unsigned NumSubElts = NumElts / Factor;
1347 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1348
1349 // Firstly, the cost of load/store operation.
1351 if (UseMaskForCond || UseMaskForGaps)
1352 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1354 else
1355 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1356 CostKind);
1357
1358 // Legalize the vector type, and get the legalized and unlegalized type
1359 // sizes.
1360 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1361 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1362 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1363
1364 // Scale the cost of the memory operation by the fraction of legalized
1365 // instructions that will actually be used. We shouldn't account for the
1366 // cost of dead instructions since they will be removed.
1367 //
1368 // E.g., An interleaved load of factor 8:
1369 // %vec = load <16 x i64>, <16 x i64>* %ptr
1370 // %v0 = shufflevector %vec, undef, <0, 8>
1371 //
1372 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1373 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1374 // type). The other loads are unused.
1375 //
1376 // TODO: Note that legalization can turn masked loads/stores into unmasked
1377 // (legalized) loads/stores. This can be reflected in the cost.
1378 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1379 // The number of loads of a legal type it will take to represent a load
1380 // of the unlegalized vector type.
1381 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1382
1383 // The number of elements of the unlegalized type that correspond to a
1384 // single legal instruction.
1385 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1386
1387 // Determine which legal instructions will be used.
1388 BitVector UsedInsts(NumLegalInsts, false);
1389 for (unsigned Index : Indices)
1390 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1391 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1392
1393 // Scale the cost of the load by the fraction of legal instructions that
1394 // will be used.
1395 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1396 }
1397
1398 // Then plus the cost of interleave operation.
1399 assert(Indices.size() <= Factor &&
1400 "Interleaved memory op has too many members");
1401
1402 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1403 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1404
1405 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1406 for (unsigned Index : Indices) {
1407 assert(Index < Factor && "Invalid index for interleaved memory op");
1408 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1409 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1410 }
1411
1412 if (Opcode == Instruction::Load) {
1413 // The interleave cost is similar to extract sub vectors' elements
1414 // from the wide vector, and insert them into sub vectors.
1415 //
1416 // E.g. An interleaved load of factor 2 (with one member of index 0):
1417 // %vec = load <8 x i32>, <8 x i32>* %ptr
1418 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1419 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1420 // <8 x i32> vector and insert them into a <4 x i32> vector.
1421 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1422 SubVT, DemandedAllSubElts,
1423 /*Insert*/ true, /*Extract*/ false, CostKind);
1424 Cost += Indices.size() * InsSubCost;
1425 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1426 /*Insert*/ false,
1427 /*Extract*/ true, CostKind);
1428 } else {
1429 // The interleave cost is extract elements from sub vectors, and
1430 // insert them into the wide vector.
1431 //
1432 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1433 // (using VF=4):
1434 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1435 // %gaps.mask = <true, true, false, true, true, false,
1436 // true, true, false, true, true, false>
1437 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1438 // i32 Align, <12 x i1> %gaps.mask
1439 // The cost is estimated as extract all elements (of actual members,
1440 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1441 // i32> vector.
1442 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1443 SubVT, DemandedAllSubElts,
1444 /*Insert*/ false, /*Extract*/ true, CostKind);
1445 Cost += ExtSubCost * Indices.size();
1446 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1447 /*Insert*/ true,
1448 /*Extract*/ false, CostKind);
1449 }
1450
1451 if (!UseMaskForCond)
1452 return Cost;
1453
1454 Type *I8Type = Type::getInt8Ty(VT->getContext());
1455
1456 Cost += thisT()->getReplicationShuffleCost(
1457 I8Type, Factor, NumSubElts,
1458 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1459 CostKind);
1460
1461 // The Gaps mask is invariant and created outside the loop, therefore the
1462 // cost of creating it is not accounted for here. However if we have both
1463 // a MaskForGaps and some other mask that guards the execution of the
1464 // memory access, we need to account for the cost of And-ing the two masks
1465 // inside the loop.
1466 if (UseMaskForGaps) {
1467 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1468 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1469 CostKind);
1470 }
1471
1472 return Cost;
1473 }
1474
1475 /// Get intrinsic cost based on arguments.
1478 // Check for generically free intrinsics.
1480 return 0;
1481
1482 // Assume that target intrinsics are cheap.
1483 Intrinsic::ID IID = ICA.getID();
1486
1487 if (ICA.isTypeBasedOnly())
1489
1490 Type *RetTy = ICA.getReturnType();
1491
1492 ElementCount RetVF =
1493 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1495 const IntrinsicInst *I = ICA.getInst();
1496 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1497 FastMathFlags FMF = ICA.getFlags();
1498 switch (IID) {
1499 default:
1500 break;
1501
1502 case Intrinsic::powi:
1503 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1504 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1505 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1506 ShouldOptForSize)) {
1507 // The cost is modeled on the expansion performed by ExpandPowI in
1508 // SelectionDAGBuilder.
1509 APInt Exponent = RHSC->getValue().abs();
1510 unsigned ActiveBits = Exponent.getActiveBits();
1511 unsigned PopCount = Exponent.popcount();
1512 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1513 thisT()->getArithmeticInstrCost(
1514 Instruction::FMul, RetTy, CostKind);
1515 if (RHSC->isNegative())
1516 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1517 CostKind);
1518 return Cost;
1519 }
1520 }
1521 break;
1522 case Intrinsic::cttz:
1523 // FIXME: If necessary, this should go in target-specific overrides.
1524 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1525 return TargetTransformInfo::TCC_Basic;
1526 break;
1527
1528 case Intrinsic::ctlz:
1529 // FIXME: If necessary, this should go in target-specific overrides.
1530 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1531 return TargetTransformInfo::TCC_Basic;
1532 break;
1533
1534 case Intrinsic::memcpy:
1535 return thisT()->getMemcpyCost(ICA.getInst());
1536
1537 case Intrinsic::masked_scatter: {
1538 const Value *Mask = Args[3];
1539 bool VarMask = !isa<Constant>(Mask);
1540 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1541 return thisT()->getGatherScatterOpCost(Instruction::Store,
1542 ICA.getArgTypes()[0], Args[1],
1543 VarMask, Alignment, CostKind, I);
1544 }
1545 case Intrinsic::masked_gather: {
1546 const Value *Mask = Args[2];
1547 bool VarMask = !isa<Constant>(Mask);
1548 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1549 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1550 VarMask, Alignment, CostKind, I);
1551 }
1552 case Intrinsic::experimental_stepvector: {
1553 if (isa<ScalableVectorType>(RetTy))
1554 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1555 // The cost of materialising a constant integer vector.
1556 return TargetTransformInfo::TCC_Basic;
1557 }
1558 case Intrinsic::vector_extract: {
1559 // FIXME: Handle case where a scalable vector is extracted from a scalable
1560 // vector
1561 if (isa<ScalableVectorType>(RetTy))
1562 return InstructionCost::getInvalid();
1563 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1564 return thisT()->getShuffleCost(
1565 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1566 std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1567 }
1568 case Intrinsic::vector_insert: {
1569 // FIXME: Handle case where a scalable vector is inserted into a scalable
1570 // vector
1571 if (isa<ScalableVectorType>(Args[1]->getType()))
1572 return InstructionCost::getInvalid();
1573 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1574 return thisT()->getShuffleCost(
1575 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1576 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1577 }
1578 case Intrinsic::experimental_vector_reverse: {
1579 return thisT()->getShuffleCost(
1580 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1581 CostKind, 0, cast<VectorType>(RetTy));
1582 }
1583 case Intrinsic::experimental_vector_splice: {
1584 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1585 return thisT()->getShuffleCost(
1586 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1587 CostKind, Index, cast<VectorType>(RetTy));
1588 }
1589 case Intrinsic::vector_reduce_add:
1590 case Intrinsic::vector_reduce_mul:
1591 case Intrinsic::vector_reduce_and:
1592 case Intrinsic::vector_reduce_or:
1593 case Intrinsic::vector_reduce_xor:
1594 case Intrinsic::vector_reduce_smax:
1595 case Intrinsic::vector_reduce_smin:
1596 case Intrinsic::vector_reduce_fmax:
1597 case Intrinsic::vector_reduce_fmin:
1598 case Intrinsic::vector_reduce_umax:
1599 case Intrinsic::vector_reduce_umin: {
1600 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1602 }
1603 case Intrinsic::vector_reduce_fadd:
1604 case Intrinsic::vector_reduce_fmul: {
1606 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1608 }
1609 case Intrinsic::fshl:
1610 case Intrinsic::fshr: {
1611 const Value *X = Args[0];
1612 const Value *Y = Args[1];
1613 const Value *Z = Args[2];
1614 const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
1615 const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
1616 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1617 const TTI::OperandValueInfo OpInfoBW =
1619 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1620 : TTI::OP_None};
1621
1622 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1623 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1624 InstructionCost Cost = 0;
1625 Cost +=
1626 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1627 Cost +=
1628 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1629 Cost += thisT()->getArithmeticInstrCost(
1630 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1631 {OpInfoZ.Kind, TTI::OP_None});
1632 Cost += thisT()->getArithmeticInstrCost(
1633 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1634 {OpInfoZ.Kind, TTI::OP_None});
1635 // Non-constant shift amounts require a modulo.
1636 if (!OpInfoZ.isConstant())
1637 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1638 CostKind, OpInfoZ, OpInfoBW);
1639 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1640 if (X != Y) {
1641 Type *CondTy = RetTy->getWithNewBitWidth(1);
1642 Cost +=
1643 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1645 Cost +=
1646 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1648 }
1649 return Cost;
1650 }
1651 case Intrinsic::get_active_lane_mask: {
1652 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1653 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1654
1655 // If we're not expanding the intrinsic then we assume this is cheap
1656 // to implement.
1657 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1658 return getTypeLegalizationCost(RetTy).first;
1659 }
1660
1661 // Create the expanded types that will be used to calculate the uadd_sat
1662 // operation.
1663 Type *ExpRetTy = VectorType::get(
1664 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1665 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1667 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1668 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1670 return Cost;
1671 }
1672 }
1673
1674 // Assume that we need to scalarize this intrinsic.
1675 // Compute the scalarization overhead based on Args for a vector
1676 // intrinsic.
1677 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1678 if (RetVF.isVector() && !RetVF.isScalable()) {
1679 ScalarizationCost = 0;
1680 if (!RetTy->isVoidTy())
1681 ScalarizationCost += getScalarizationOverhead(
1682 cast<VectorType>(RetTy),
1683 /*Insert*/ true, /*Extract*/ false, CostKind);
1684 ScalarizationCost +=
1686 }
1687
1688 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1689 ScalarizationCost);
1690 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1691 }
1692
1693 /// Get intrinsic cost based on argument types.
1694 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1695 /// cost of scalarizing the arguments and the return value will be computed
1696 /// based on types.
1700 Intrinsic::ID IID = ICA.getID();
1701 Type *RetTy = ICA.getReturnType();
1702 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1703 FastMathFlags FMF = ICA.getFlags();
1704 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1705 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1706
1707 VectorType *VecOpTy = nullptr;
1708 if (!Tys.empty()) {
1709 // The vector reduction operand is operand 0 except for fadd/fmul.
1710 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1711 unsigned VecTyIndex = 0;
1712 if (IID == Intrinsic::vector_reduce_fadd ||
1713 IID == Intrinsic::vector_reduce_fmul)
1714 VecTyIndex = 1;
1715 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1716 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1717 }
1718
1719 // Library call cost - other than size, make it expensive.
1720 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1721 unsigned ISD = 0;
1722 switch (IID) {
1723 default: {
1724 // Scalable vectors cannot be scalarized, so return Invalid.
1725 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1726 return isa<ScalableVectorType>(Ty);
1727 }))
1728 return InstructionCost::getInvalid();
1729
1730 // Assume that we need to scalarize this intrinsic.
1731 InstructionCost ScalarizationCost =
1732 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1733 unsigned ScalarCalls = 1;
1734 Type *ScalarRetTy = RetTy;
1735 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1736 if (!SkipScalarizationCost)
1737 ScalarizationCost = getScalarizationOverhead(
1738 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1739 ScalarCalls = std::max(ScalarCalls,
1740 cast<FixedVectorType>(RetVTy)->getNumElements());
1741 ScalarRetTy = RetTy->getScalarType();
1742 }
1743 SmallVector<Type *, 4> ScalarTys;
1744 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1745 Type *Ty = Tys[i];
1746 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1747 if (!SkipScalarizationCost)
1748 ScalarizationCost += getScalarizationOverhead(
1749 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1750 ScalarCalls = std::max(ScalarCalls,
1751 cast<FixedVectorType>(VTy)->getNumElements());
1752 Ty = Ty->getScalarType();
1753 }
1754 ScalarTys.push_back(Ty);
1755 }
1756 if (ScalarCalls == 1)
1757 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1758
1759 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1760 InstructionCost ScalarCost =
1761 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1762
1763 return ScalarCalls * ScalarCost + ScalarizationCost;
1764 }
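// Worked example (illustrative, for a hypothetical <4 x float> intrinsic with
// no vector lowering): ScalarCalls = 4, ScalarCost is the cost of one scalar
// call, and ScalarizationCost covers inserting the four results plus
// extracting the four lanes of each vector operand, giving
// 4 * ScalarCost + insert/extract overhead.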
1765 // Look for intrinsics that can be lowered directly or turned into a scalar
1766 // intrinsic call.
1767 case Intrinsic::sqrt:
1768 ISD = ISD::FSQRT;
1769 break;
1770 case Intrinsic::sin:
1771 ISD = ISD::FSIN;
1772 break;
1773 case Intrinsic::cos:
1774 ISD = ISD::FCOS;
1775 break;
1776 case Intrinsic::exp:
1777 ISD = ISD::FEXP;
1778 break;
1779 case Intrinsic::exp2:
1780 ISD = ISD::FEXP2;
1781 break;
1782 case Intrinsic::log:
1783 ISD = ISD::FLOG;
1784 break;
1785 case Intrinsic::log10:
1786 ISD = ISD::FLOG10;
1787 break;
1788 case Intrinsic::log2:
1789 ISD = ISD::FLOG2;
1790 break;
1791 case Intrinsic::fabs:
1792 ISD = ISD::FABS;
1793 break;
1794 case Intrinsic::canonicalize:
1795 ISD = ISD::FCANONICALIZE;
1796 break;
1797 case Intrinsic::minnum:
1798 ISD = ISD::FMINNUM;
1799 break;
1800 case Intrinsic::maxnum:
1801 ISD = ISD::FMAXNUM;
1802 break;
1803 case Intrinsic::minimum:
1804 ISD = ISD::FMINIMUM;
1805 break;
1806 case Intrinsic::maximum:
1807 ISD = ISD::FMAXIMUM;
1808 break;
1809 case Intrinsic::copysign:
1810 ISD = ISD::FCOPYSIGN;
1811 break;
1812 case Intrinsic::floor:
1813 ISD = ISD::FFLOOR;
1814 break;
1815 case Intrinsic::ceil:
1816 ISD = ISD::FCEIL;
1817 break;
1818 case Intrinsic::trunc:
1819 ISD = ISD::FTRUNC;
1820 break;
1821 case Intrinsic::nearbyint:
1822 ISD = ISD::FNEARBYINT;
1823 break;
1824 case Intrinsic::rint:
1825 ISD = ISD::FRINT;
1826 break;
1827 case Intrinsic::round:
1828 ISD = ISD::FROUND;
1829 break;
1830 case Intrinsic::roundeven:
1831 ISD = ISD::FROUNDEVEN;
1832 break;
1833 case Intrinsic::pow:
1834 ISD = ISD::FPOW;
1835 break;
1836 case Intrinsic::fma:
1837 ISD = ISD::FMA;
1838 break;
1839 case Intrinsic::fmuladd:
1840 ISD = ISD::FMA;
1841 break;
1842 case Intrinsic::experimental_constrained_fmuladd:
1843 ISD = ISD::STRICT_FMA;
1844 break;
1845 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1846 case Intrinsic::lifetime_start:
1847 case Intrinsic::lifetime_end:
1848 case Intrinsic::sideeffect:
1849 case Intrinsic::pseudoprobe:
1850 case Intrinsic::arithmetic_fence:
1851 return 0;
1852 case Intrinsic::masked_store: {
1853 Type *Ty = Tys[0];
1854 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1855 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1856 CostKind);
1857 }
1858 case Intrinsic::masked_load: {
1859 Type *Ty = RetTy;
1860 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1861 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1862 CostKind);
1863 }
1864 case Intrinsic::vector_reduce_add:
1865 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1866 std::nullopt, CostKind);
1867 case Intrinsic::vector_reduce_mul:
1868 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1869 std::nullopt, CostKind);
1870 case Intrinsic::vector_reduce_and:
1871 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1872 std::nullopt, CostKind);
1873 case Intrinsic::vector_reduce_or:
1874 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1875 std::nullopt, CostKind);
1876 case Intrinsic::vector_reduce_xor:
1877 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1878 std::nullopt, CostKind);
1879 case Intrinsic::vector_reduce_fadd:
1880 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1881 FMF, CostKind);
1882 case Intrinsic::vector_reduce_fmul:
1883 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1884 FMF, CostKind);
1885 case Intrinsic::vector_reduce_smax:
1886 case Intrinsic::vector_reduce_smin:
1887 case Intrinsic::vector_reduce_fmax:
1888 case Intrinsic::vector_reduce_fmin:
1889 return thisT()->getMinMaxReductionCost(
1890 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1891 /*IsUnsigned=*/false, ICA.getFlags(), CostKind);
1892 case Intrinsic::vector_reduce_umax:
1893 case Intrinsic::vector_reduce_umin:
1894 return thisT()->getMinMaxReductionCost(
1895 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1896 /*IsUnsigned=*/true, ICA.getFlags(), CostKind);
1897 case Intrinsic::abs: {
1898 // abs(X) = select(icmp(X,0),X,sub(0,X))
 1899 Type *CondTy = RetTy->getWithNewBitWidth(1);
 1900 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
 1901 InstructionCost Cost = 0;
 1902 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1903 Pred, CostKind);
1904 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1905 Pred, CostKind);
1906 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1907 Cost += thisT()->getArithmeticInstrCost(
1908 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});
1909 return Cost;
1910 }
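// Possible expansion this models (illustrative, assuming scalar i32):
//   %neg = sub i32 0, %x
//   %cmp = icmp sgt i32 %x, -1
//   %res = select i1 %cmp, i32 %x, i32 %neg
// hence one compare, one select and one subtract are costed.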
1911 case Intrinsic::smax:
1912 case Intrinsic::smin:
1913 case Intrinsic::umax:
1914 case Intrinsic::umin: {
1915 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1916 Type *CondTy = RetTy->getWithNewBitWidth(1);
1917 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
1918 CmpInst::Predicate Pred =
 1919 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
 1920 InstructionCost Cost = 0;
 1921 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1922 Pred, CostKind);
1923 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1924 Pred, CostKind);
1925 return Cost;
1926 }
1927 case Intrinsic::sadd_sat:
1928 case Intrinsic::ssub_sat: {
1929 Type *CondTy = RetTy->getWithNewBitWidth(1);
1930
1931 Type *OpTy = StructType::create({RetTy, CondTy});
1932 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1933 ? Intrinsic::sadd_with_overflow
 1934 : Intrinsic::ssub_with_overflow;
 1935 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
 1936
 1937 // SatMax -> Overflow && SumDiff < 0
 1938 // SatMin -> Overflow && SumDiff >= 0
 1939 InstructionCost Cost = 0;
 1940 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1941 nullptr, ScalarizationCostPassed);
1942 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1943 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1944 Pred, CostKind);
1945 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
1946 CondTy, Pred, CostKind);
1947 return Cost;
1948 }
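// Cost sketch (illustrative): signed saturation is modeled as the matching
// *.with.overflow intrinsic producing {sum, overflow}, one signed compare of
// the sum to choose between SatMax and SatMin, and two selects to merge the
// saturated constant with the plain sum.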
1949 case Intrinsic::uadd_sat:
1950 case Intrinsic::usub_sat: {
1951 Type *CondTy = RetTy->getWithNewBitWidth(1);
1952
1953 Type *OpTy = StructType::create({RetTy, CondTy});
1954 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1955 ? Intrinsic::uadd_with_overflow
1956 : Intrinsic::usub_with_overflow;
1957
1959 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1960 nullptr, ScalarizationCostPassed);
1961 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1962 Cost +=
 1963 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
 1964 CmpInst::BAD_ICMP_PREDICATE, CostKind);
 1965 return Cost;
1966 }
1967 case Intrinsic::smul_fix:
1968 case Intrinsic::umul_fix: {
1969 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1970 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1971
1972 unsigned ExtOp =
 1973 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
 1974 TTI::CastContextHint CCH = TTI::CastContextHint::None;
 1975
 1976 InstructionCost Cost = 0;
 1977 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1978 Cost +=
1979 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1980 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1981 CCH, CostKind);
1982 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
 1983 CostKind,
 1984 {TTI::OK_AnyValue, TTI::OP_None},
 1985 {TTI::OK_UniformConstantValue, TTI::OP_None});
 1986 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
 1987 {TTI::OK_AnyValue, TTI::OP_None},
 1988 {TTI::OK_UniformConstantValue, TTI::OP_None});
 1989 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1990 return Cost;
1991 }
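// Worked breakdown (illustrative, for i32 with scale S): the fixed-point
// multiply is modeled in double width, i.e. extend both operands to i64
// (2 casts), multiply in i64, truncate the low and high halves back to i32
// (2 casts), then recombine them with lshr-by-S, shl-by-(32 - S) and or.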
1992 case Intrinsic::sadd_with_overflow:
1993 case Intrinsic::ssub_with_overflow: {
1994 Type *SumTy = RetTy->getContainedType(0);
1995 Type *OverflowTy = RetTy->getContainedType(1);
1996 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1997 ? BinaryOperator::Add
1998 : BinaryOperator::Sub;
1999
2000 // Add:
2001 // Overflow -> (Result < LHS) ^ (RHS < 0)
2002 // Sub:
 2003 // Overflow -> (Result < LHS) ^ (RHS > 0)
 2004 InstructionCost Cost = 0;
 2005 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
 2006 Cost += 2 * thisT()->getCmpSelInstrCost(
 2007 Instruction::ICmp, SumTy, OverflowTy,
 2008 CmpInst::ICMP_SGT, CostKind);
 2009 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2010 CostKind);
2011 return Cost;
2012 }
2013 case Intrinsic::uadd_with_overflow:
2014 case Intrinsic::usub_with_overflow: {
2015 Type *SumTy = RetTy->getContainedType(0);
2016 Type *OverflowTy = RetTy->getContainedType(1);
2017 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2018 ? BinaryOperator::Add
2019 : BinaryOperator::Sub;
 2020 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
 2021 ? CmpInst::ICMP_ULT
 2022 : CmpInst::ICMP_UGT;
 2023
 2024 InstructionCost Cost = 0;
 2025 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2026 Cost +=
2027 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2028 Pred, CostKind);
2029 return Cost;
2030 }
2031 case Intrinsic::smul_with_overflow:
2032 case Intrinsic::umul_with_overflow: {
2033 Type *MulTy = RetTy->getContainedType(0);
2034 Type *OverflowTy = RetTy->getContainedType(1);
2035 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2036 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2037 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2038
 2039 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
 2040 TTI::CastContextHint CCH = TTI::CastContextHint::None;
 2041
 2042 InstructionCost Cost = 0;
 2043 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2044 Cost +=
2045 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2046 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2047 CCH, CostKind);
2048 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
 2049 CostKind,
 2050 {TTI::OK_AnyValue, TTI::OP_None},
 2051 {TTI::OK_UniformConstantValue, TTI::OP_None});
 2052
 2053 if (IsSigned)
 2054 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
 2055 CostKind,
 2056 {TTI::OK_AnyValue, TTI::OP_None},
 2057 {TTI::OK_UniformConstantValue, TTI::OP_None});
 2058
2059 Cost += thisT()->getCmpSelInstrCost(
2060 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2061 return Cost;
2062 }
2063 case Intrinsic::fptosi_sat:
2064 case Intrinsic::fptoui_sat: {
2065 if (Tys.empty())
2066 break;
2067 Type *FromTy = Tys[0];
2068 bool IsSigned = IID == Intrinsic::fptosi_sat;
 2069
 2070 InstructionCost Cost = 0;
 2071 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2072 {FromTy, FromTy});
2073 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2074 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2075 {FromTy, FromTy});
2076 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2077 Cost += thisT()->getCastInstrCost(
 2078 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
 2079 TTI::CastContextHint::None, CostKind);
 2080 if (IsSigned) {
2081 Type *CondTy = RetTy->getWithNewBitWidth(1);
2082 Cost += thisT()->getCmpSelInstrCost(
2083 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2084 Cost += thisT()->getCmpSelInstrCost(
2085 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2086 }
2087 return Cost;
2088 }
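// Cost sketch (illustrative): the saturating conversion is modeled as clamping
// the input to the representable range with minnum/maxnum, one plain
// fptosi/fptoui, and, for the signed case, an extra fcmp uno + select so that
// NaN inputs produce zero.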
2089 case Intrinsic::ctpop:
2090 ISD = ISD::CTPOP;
2091 // In case of legalization use TCC_Expensive. This is cheaper than a
2092 // library call but still not a cheap instruction.
2093 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2094 break;
2095 case Intrinsic::ctlz:
2096 ISD = ISD::CTLZ;
2097 break;
2098 case Intrinsic::cttz:
2099 ISD = ISD::CTTZ;
2100 break;
2101 case Intrinsic::bswap:
2102 ISD = ISD::BSWAP;
2103 break;
2104 case Intrinsic::bitreverse:
2105 ISD = ISD::BITREVERSE;
2106 break;
2107 }
2108
2109 const TargetLoweringBase *TLI = getTLI();
2110 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2111
2112 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2113 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2114 TLI->isFAbsFree(LT.second)) {
2115 return 0;
2116 }
2117
2118 // The operation is legal. Assume it costs 1.
2119 // If the type is split to multiple registers, assume that there is some
2120 // overhead to this.
2121 // TODO: Once we have extract/insert subvector cost we need to use them.
2122 if (LT.first > 1)
2123 return (LT.first * 2);
2124 else
2125 return (LT.first * 1);
2126 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2127 // If the operation is custom lowered then assume
2128 // that the code is twice as expensive.
2129 return (LT.first * 2);
2130 }
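// Example (illustrative): if a <8 x float> operation legalizes to two
// <4 x float> registers (LT.first == 2), the legal-but-split path above
// returns 2 * 2 = 4; a custom-lowered operation is likewise assumed to cost
// twice LT.first.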
2131
2132 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2133 // point mul followed by an add.
2134 if (IID == Intrinsic::fmuladd)
2135 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2136 CostKind) +
2137 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2138 CostKind);
2139 if (IID == Intrinsic::experimental_constrained_fmuladd) {
2140 IntrinsicCostAttributes FMulAttrs(
2141 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2142 IntrinsicCostAttributes FAddAttrs(
2143 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2144 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2145 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2146 }
2147
2148 // Else, assume that we need to scalarize this intrinsic. For math builtins
2149 // this will emit a costly libcall, adding call overhead and spills. Make it
2150 // very expensive.
2151 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2152 // Scalable vectors cannot be scalarized, so return Invalid.
2153 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2154 return isa<ScalableVectorType>(Ty);
 2155 }))
 2156 return InstructionCost::getInvalid();
 2157
2158 InstructionCost ScalarizationCost =
2159 SkipScalarizationCost
2160 ? ScalarizationCostPassed
2161 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2162 /*Extract*/ false, CostKind);
2163
2164 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2165 SmallVector<Type *, 4> ScalarTys;
2166 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2167 Type *Ty = Tys[i];
2168 if (Ty->isVectorTy())
2169 Ty = Ty->getScalarType();
2170 ScalarTys.push_back(Ty);
2171 }
2172 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2173 InstructionCost ScalarCost =
2174 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2175 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2176 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2177 if (!ICA.skipScalarizationCost())
2178 ScalarizationCost += getScalarizationOverhead(
2179 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2180 ScalarCalls = std::max(ScalarCalls,
2181 cast<FixedVectorType>(VTy)->getNumElements());
2182 }
2183 }
2184 return ScalarCalls * ScalarCost + ScalarizationCost;
2185 }
2186
2187 // This is going to be turned into a library call, make it expensive.
2188 return SingleCallCost;
2189 }
2190
2191 /// Compute a cost of the given call instruction.
2192 ///
2193 /// Compute the cost of calling function F with return type RetTy and
2194 /// argument types Tys. F might be nullptr, in this case the cost of an
2195 /// arbitrary call with the specified signature will be returned.
2196 /// This is used, for instance, when we estimate call of a vector
2197 /// counterpart of the given function.
2198 /// \param F Called function, might be nullptr.
2199 /// \param RetTy Return value types.
2200 /// \param Tys Argument types.
 2201 /// \returns The cost of Call instruction.
 2202 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
 2203 ArrayRef<Type *> Tys,
 2204 TTI::TargetCostKind CostKind) {
 2205 return 10;
2206 }
2207
2208 unsigned getNumberOfParts(Type *Tp) {
2209 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2210 return LT.first.isValid() ? *LT.first.getValue() : 0;
2211 }
 2212
 2213 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
 2214 const SCEV *) {
2215 return 0;
2216 }
2217
2218 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
 2219 /// We're assuming that reduction operations are performed the following way:
2220 ///
2221 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2222 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2223 /// \----------------v-------------/ \----------v------------/
2224 /// n/2 elements n/2 elements
2225 /// %red1 = op <n x t> %val, <n x t> val1
2226 /// After this operation we have a vector %red1 where only the first n/2
2227 /// elements are meaningful, the second n/2 elements are undefined and can be
2228 /// dropped. All other operations are actually working with the vector of
2229 /// length n/2, not n, though the real vector length is still n.
2230 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2231 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2232 /// \----------------v-------------/ \----------v------------/
2233 /// n/4 elements 3*n/4 elements
2234 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2235 /// length n/2, the resulting vector has length n/4 etc.
2236 ///
2237 /// The cost model should take into account that the actual length of the
 2238 /// vector is reduced on each iteration.
 2239 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
 2240 TTI::TargetCostKind CostKind) {
 2241 // Targets must implement a default value for the scalable case, since
 2242 // we don't know how many lanes the vector has.
 2243 if (isa<ScalableVectorType>(Ty))
 2244 return InstructionCost::getInvalid();
 2245
2246 Type *ScalarTy = Ty->getElementType();
2247 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2248 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2249 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2250 NumVecElts >= 2) {
2251 // Or reduction for i1 is represented as:
2252 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2253 // %res = cmp ne iReduxWidth %val, 0
2254 // And reduction for i1 is represented as:
2255 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2256 // %res = cmp eq iReduxWidth %val, 11111
2257 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
 2258 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
 2259 TTI::CastContextHint::None, CostKind) +
 2260 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
 2261 CmpInst::makeCmpResultType(ValTy),
 2262 CmpInst::BAD_ICMP_PREDICATE, CostKind);
 2263 }
2264 unsigned NumReduxLevels = Log2_32(NumVecElts);
2265 InstructionCost ArithCost = 0;
2266 InstructionCost ShuffleCost = 0;
2267 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2268 unsigned LongVectorCount = 0;
2269 unsigned MVTLen =
2270 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2271 while (NumVecElts > MVTLen) {
2272 NumVecElts /= 2;
2273 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2274 ShuffleCost +=
2275 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2276 CostKind, NumVecElts, SubTy);
2277 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2278 Ty = SubTy;
2279 ++LongVectorCount;
2280 }
2281
2282 NumReduxLevels -= LongVectorCount;
2283
2284 // The minimal length of the vector is limited by the real length of vector
2285 // operations performed on the current platform. That's why several final
2286 // reduction operations are performed on the vectors with the same
2287 // architecture-dependent length.
2288
2289 // By default reductions need one shuffle per reduction level.
2290 ShuffleCost +=
2291 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2292 std::nullopt, CostKind, 0, Ty);
2293 ArithCost +=
2294 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2295 return ShuffleCost + ArithCost +
2296 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2297 CostKind, 0, nullptr, nullptr);
2298 }
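// Worked example (illustrative): an add reduction of <8 x i32> on a target
// whose widest legal vector is <4 x i32> is costed as one extract-subvector
// shuffle plus one <4 x i32> add to reach the legal width, then two more
// shuffle + add levels (log2(4)) on the legal type, and finally one
// extractelement of lane 0: 3 shuffles + 3 adds + 1 extract in total.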
2299
2300 /// Try to calculate the cost of performing strict (in-order) reductions,
2301 /// which involves doing a sequence of floating point additions in lane
2302 /// order, starting with an initial value. For example, consider a scalar
2303 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2304 ///
2305 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2306 ///
2307 /// %add1 = %InitVal + %v0
2308 /// %add2 = %add1 + %v1
2309 /// %add3 = %add2 + %v2
2310 /// %add4 = %add3 + %v3
2311 ///
2312 /// As a simple estimate we can say the cost of such a reduction is 4 times
2313 /// the cost of a scalar FP addition. We can only estimate the costs for
2314 /// fixed-width vectors here because for scalable vectors we do not know the
 2315 /// runtime number of operations.
 2316 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
 2317 TTI::TargetCostKind CostKind) {
 2318 // Targets must implement a default value for the scalable case, since
 2319 // we don't know how many lanes the vector has.
 2320 if (isa<ScalableVectorType>(Ty))
 2321 return InstructionCost::getInvalid();
 2322
 2323 auto *VTy = cast<FixedVectorType>(Ty);
 2324 InstructionCost ExtractCost = getScalarizationOverhead(
 2325 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2326 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2327 Opcode, VTy->getElementType(), CostKind);
2328 ArithCost *= VTy->getNumElements();
2329
2330 return ExtractCost + ArithCost;
2331 }
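// Worked example (illustrative): for the <4 x float> case described above this
// evaluates to the cost of extracting each of the four lanes plus four scalar
// fadds, i.e. roughly NumElements * (extract + fadd) rather than a log2-depth
// tree.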
 2332
 2333 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
 2334 std::optional<FastMathFlags> FMF,
 2335 TTI::TargetCostKind CostKind) {
 2336 assert(Ty && "Unknown reduction vector type");
 2337 if (TTI::requiresOrderedReduction(FMF))
 2338 return getOrderedReductionCost(Opcode, Ty, CostKind);
2339 return getTreeReductionCost(Opcode, Ty, CostKind);
2340 }
2341
2342 /// Try to calculate op costs for min/max reduction operations.
 2343 /// \param CondTy Conditional type for the Select instruction.
 2344 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
 2345 bool IsUnsigned, FastMathFlags FMF,
 2346 TTI::TargetCostKind CostKind) {
 2347 // Targets must implement a default value for the scalable case, since
 2348 // we don't know how many lanes the vector has.
 2349 if (isa<ScalableVectorType>(Ty))
 2350 return InstructionCost::getInvalid();
 2351
2352 Type *ScalarTy = Ty->getElementType();
2353 Type *ScalarCondTy = CondTy->getElementType();
2354 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2355 unsigned NumReduxLevels = Log2_32(NumVecElts);
2356 unsigned CmpOpcode;
2357 if (Ty->isFPOrFPVectorTy()) {
2358 CmpOpcode = Instruction::FCmp;
2359 } else {
2360 assert(Ty->isIntOrIntVectorTy() &&
2361 "expecting floating point or integer type for min/max reduction");
2362 CmpOpcode = Instruction::ICmp;
2363 }
2364 InstructionCost MinMaxCost = 0;
2365 InstructionCost ShuffleCost = 0;
2366 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2367 unsigned LongVectorCount = 0;
2368 unsigned MVTLen =
2369 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2370 while (NumVecElts > MVTLen) {
2371 NumVecElts /= 2;
2372 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2373 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2374
2375 ShuffleCost +=
2376 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2377 CostKind, NumVecElts, SubTy);
2378 MinMaxCost +=
 2379 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
 2380 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
 2381 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
 2382 CmpInst::BAD_ICMP_PREDICATE, CostKind);
 2383 Ty = SubTy;
2384 ++LongVectorCount;
2385 }
2386
2387 NumReduxLevels -= LongVectorCount;
2388
2389 // The minimal length of the vector is limited by the real length of vector
2390 // operations performed on the current platform. That's why several final
 2391 // reduction operations are performed on the vectors with the same
2392 // architecture-dependent length.
2393 ShuffleCost +=
2394 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2395 std::nullopt, CostKind, 0, Ty);
2396 MinMaxCost +=
2397 NumReduxLevels *
 2398 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
 2399 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
 2400 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
 2401 CmpInst::BAD_ICMP_PREDICATE, CostKind));
 2402 // The last min/max should be in vector registers and we counted it above.
2403 // So just need a single extractelement.
2404 return ShuffleCost + MinMaxCost +
2405 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2406 CostKind, 0, nullptr, nullptr);
2407 }
2408
2409 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2410 Type *ResTy, VectorType *Ty,
 2411 FastMathFlags FMF,
 2412 TTI::TargetCostKind CostKind) {
 2413 // Without any native support, this is equivalent to the cost of
2414 // vecreduce.opcode(ext(Ty A)).
2415 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2416 InstructionCost RedCost =
2417 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2418 InstructionCost ExtCost = thisT()->getCastInstrCost(
 2419 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
 2420 TTI::CastContextHint::None, CostKind);
 2421
2422 return RedCost + ExtCost;
2423 }
 2424
 2425 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
 2426 VectorType *Ty,
 2427 TTI::TargetCostKind CostKind) {
 2428 // Without any native support, this is equivalent to the cost of
2429 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2430 // vecreduce.add(mul(A, B)).
2431 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2432 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2433 Instruction::Add, ExtTy, std::nullopt, CostKind);
2434 InstructionCost ExtCost = thisT()->getCastInstrCost(
 2435 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
 2436 TTI::CastContextHint::None, CostKind);
 2437
2438 InstructionCost MulCost =
2439 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2440
2441 return RedCost + MulCost + 2 * ExtCost;
2442 }
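// Worked example (illustrative): a dot-product style pattern such as
// vecreduce.add(mul(zext(<8 x i8> %a), zext(<8 x i8> %b))) accumulated into
// i32 is costed as two zexts to <8 x i32>, one <8 x i32> multiply and one
// <8 x i32> add reduction, i.e. RedCost + MulCost + 2 * ExtCost with
// ExtTy = <8 x i32>.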
2443
 2444 InstructionCost getVectorSplitCost() { return 1; }
 2445
2446 /// @}
2447};
2448
2449/// Concrete BasicTTIImpl that can be used if no further customization
2450/// is needed.
2451class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
 2452 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
 2453
2454 friend class BasicTTIImplBase<BasicTTIImpl>;
2455
2456 const TargetSubtargetInfo *ST;
2457 const TargetLoweringBase *TLI;
2458
2459 const TargetSubtargetInfo *getST() const { return ST; }
2460 const TargetLoweringBase *getTLI() const { return TLI; }
2461
2462public:
2463 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2464};
2465
2466} // end namespace llvm
2467
2468#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:75
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:214
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1312
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1183
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1443
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1112
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:177
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:79
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:408
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:285
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:680
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:385
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:540
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:847
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:709
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:714
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:398
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:620
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:849
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
Definition: BasicTTIImpl.h:627
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:700
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:326
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:402
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:692
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:713
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:413
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:489
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:963
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:424
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:361
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:373
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:642
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:684
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:367
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:303
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:766
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:720
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:521
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
unsigned getInliningThresholdMultiplier()
Definition: BasicTTIImpl.h:535
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:660
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:283
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr)
Definition: BasicTTIImpl.h:330
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:632
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:271
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:341
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:750
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:666
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:696
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:317
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:612
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:294
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:525
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:813
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:514
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:313
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:927
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:795
bool isVScaleKnownToBeAPowerOfTwo() const
Definition: BasicTTIImpl.h:715
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:418
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:636
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:322
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:289
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:993
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:676
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:299
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:281
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:649
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:307
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:262
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:536
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:381
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1190
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1058
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:711
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:734
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:738
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:736
@ ICMP_EQ
equal
Definition: InstrTypes.h:732
@ ICMP_NE
not equal
Definition: InstrTypes.h:733
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:721
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:480
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:416
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:306
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:291
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:302
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:536
unsigned getNumElements() const
Definition: DerivedTypes.h:579
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:754
bool isTargetIntrinsic() const
isTargetIntrinsic - Returns true if this function is an intrinsic and the intrinsic is specific to a ...
Definition: Function.cpp:848
The core instruction combiner logic.
Definition: InstCombiner.h:45
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:339
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:47
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associatvity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isSelectMask(ArrayRef< int > Mask)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isReverseMask(ArrayRef< int > Mask)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isZeroEltSplatMask(ArrayRef< int > Mask)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isSpliceMask(ArrayRef< int > Mask, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isTransposeMask(ArrayRef< int > Mask)
Return true if this shuffle mask is a transpose mask.
size_type size() const
Definition: SmallPtrSet.h:93
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:574
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) const
bool isLoweredToCall(const Function *F) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:356
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1464
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, or DriverKit).
Definition: Triple.h:519
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:235
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP types.
Definition: Type.h:217
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:400
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same element type.
Definition: DerivedTypes.h:504
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:738
Type * getElementType() const
Definition: DerivedTypes.h:433
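Two hypothetical one-liners showing the constructors above: building a vector type from an element type and an element count, and splitting a vector type in half as is commonly done during legalization.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Hypothetical helpers; VectorType::get and getHalfElementsVectorType are
// the real APIs.
static VectorType *makeVector(Type *EltTy, unsigned NumElts, bool Scalable) {
  return VectorType::get(EltTy, ElementCount::get(NumElts, Scalable));
}

static VectorType *halveVector(VectorType *VTy) {
  // Keeps the element type and halves the (fixed or scalable) element count.
  return VectorType::getHalfElementsVectorType(VTy);
}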
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:198
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
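A hypothetical comparison helper: with scalable quantities the relation may depend on the runtime vscale, so isKnownLT only returns true when the ordering is provable.

#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Hypothetical helper; returns true only when LHS is provably smaller.
static bool definitelyNarrower(TypeSize LHS, TypeSize RHS) {
  return TypeSize::isKnownLT(LHS, RHS);
}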
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
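A hypothetical use of this helper when a demanded-elements mask has to follow a change in lane count; note the assumption that the function is reached through the APIntOps namespace, as its usual call sites suggest.

#include "llvm/ADT/APInt.h"

using namespace llvm;

// Hypothetical helper: splat each original bit across the wider mask (or
// merge neighbouring bits when narrowing).
static APInt rescaleDemandedElts(const APInt &DemandedElts,
                                 unsigned NewNumElts) {
  return APIntOps::ScaleBitMask(DemandedElts, NewNumElts);
}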
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:713
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:482
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:999
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1003
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:499
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:726
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:955
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector result.
Definition: ISDOpcodes.h:735
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:968
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:492
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1396
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:522
AddressSpace
Definition: NVPTXBaseInfo.h:21
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1826
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:382
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
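A tiny, self-contained illustration of the math helpers above; the values are arbitrary and serve only to pin down the semantics.

#include "llvm/Support/MathExtras.h"
#include <cassert>

using namespace llvm;

static void mathHelpersDemo() {
  assert(divideCeil(10, 4) == 3);     // integer ceiling division
  assert(Log2_32(32) == 5);           // floor log base 2
  assert(isPowerOf2_32(64));          // 64 is a power of two > 0
  assert(!isPowerOf2_32(48));         // 48 is not
}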
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:184
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:616
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:299
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
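A hypothetical check built on these EVT helpers: map an IR type to an EVT and only go on to getSimpleVT when the type is simple (i.e. backed by an MVT).

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Hypothetical helper; EVT::getEVT and EVT::isSimple are the real APIs.
static bool hasSimpleEVT(Type *Ty) {
  EVT VT = EVT::getEVT(Ty, /*HandleUnknown=*/true);
  return VT.isSimple();               // only then is getSimpleVT() valid
}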
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg. If BaseGV is null, there is no BaseGV.
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling based on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before the loop body).
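A hypothetical tuning of the peeling parameters listed above, of the kind a target hook might perform; the chosen values are illustrative, not a recommendation.

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Hypothetical adjustment of TTI::PeelingPreferences.
static void adjustPeeling(TargetTransformInfo::PeelingPreferences &PP) {
  PP.AllowPeeling = true;            // permit peeling in general
  PP.AllowLoopNestsPeeling = false;  // but not for whole loop nests
  PP.PeelProfiledIterations = true;  // allow profile-driven peeling
  PP.PeelCount = 0;                  // no forced peel count
}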
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compares and branches).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
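A hypothetical tuning of the unrolling parameters listed above; the values are illustrative only and would normally come from a target's getUnrollingPreferences hook.

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Hypothetical adjustment of TTI::UnrollingPreferences.
static void adjustUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;               // allow partial unrolling
  UP.Runtime = true;               // allow runtime unrolling
  UP.UpperBound = false;           // do not unroll on trip-count upper bound
  UP.PartialThreshold = 200;       // size budget for partial/runtime unrolling
  UP.OptSizeThreshold = 0;         // effectively disable unrolling at -Os
  UP.PartialOptSizeThreshold = 0;  // likewise for partial unrolling at -Os
}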