LLVM 19.0.0git
ARMTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
// NOTE(review): the opening `static cl::opt<bool> <Name>(` declaration line of
// this option appears to be missing from this extraction.
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

// NOTE(review): the opening declaration line of this option is missing here as
// well (extraction artifact).
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

// Command-line switch gating generation of WLS (while-loop-start) loops;
// defaults to enabled.
static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));
58
60
62
64
65/// Convert a vector load intrinsic into a simple llvm load instruction.
66/// This is beneficial when the underlying object being addressed comes
67/// from a constant, since we get constant-folding for free.
68static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
69 InstCombiner::BuilderTy &Builder) {
70 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
71
72 if (!IntrAlign)
73 return nullptr;
74
75 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
76 ? MemAlign
77 : IntrAlign->getLimitedValue();
78
79 if (!isPowerOf2_32(Alignment))
80 return nullptr;
81
82 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
83 PointerType::get(II.getType(), 0));
84 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
85}
86
// Compares the resolved subtarget feature bits of Caller and Callee —
// presumably the inline-compatibility check (signature not visible).
// NOTE(review): the opening signature line (`bool
// ARMTTIImpl::areInlineCompatible(const Function *Caller,`-style) appears to
// be missing from this extraction.
                                             const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  // Feature bitsets for each function's resolved subtarget.
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset of
  // the callers'.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}
104
// Selects the preferred addressing-mode kind for loop L on this subtarget.
// NOTE(review): the opening signature lines of this member function appear to
// be missing from this extraction.
                                                       ScalarEvolution *SE) const {
  if (ST->hasMVEIntegerOps())
  // NOTE(review): the return statement for the MVE case appears to be missing
  // here (extraction artifact); as written, the `if` binds to the next
  // statement.

  // No benefit under minsize/optsize.
  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  // Single-block loops on Thumb2 M-class cores prefer pre-indexed addressing.
  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}
120
// Target-specific instcombine folds for ARM/NEON/MVE intrinsics. Returns the
// replacement instruction, &II when II was modified in place, or std::nullopt
// when no fold applies.
std::optional<Instruction *>
// NOTE(review): the signature line (`ARMTTIImpl::instCombineIntrinsic(
// InstCombiner &IC, IntrinsicInst &II) const {`-style) appears to be missing
// from this extraction.
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    // Fold a vld1 of sufficiently aligned memory into a plain load.
    Align MemAlign =
    // NOTE(review): the initializer of MemAlign (likely a known-alignment
    // query) is missing from this extraction.
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    // Raise the intrinsic's alignment operand when the known memory alignment
    // is larger than what the intrinsic currently states.
    Align MemAlign =
    // NOTE(review): the initializer of MemAlign is missing from this
    // extraction.
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_mve_pred_i2v: {
    // i2v(v2i(x)) --> x when the types line up.
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    // i2v(v2i(x) ^ all-ones-in-low-16) --> x ^ splat(true).
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    // Only the low 16 bits of the scalar operand are meaningful.
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown, 0)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    // v2i(i2v(x)) --> x.
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    // Otherwise annotate the result as a 16-bit value: range [0, 0x10000) and
    // noundef.
    if (!II.getMetadata(LLVMContext::MD_range)) {
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      Metadata *M[] = {
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
      II.setMetadata(LLVMContext::MD_noundef,
                     MDNode::get(II.getContext(), std::nullopt));
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    // Only bit 29 of the carry operand is demanded by these intrinsics.
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
    // Fold (add (vmldava acc=0, ...), z) into vmldava with accumulator z when
    // the intrinsic's only use is that add.
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        // NOTE(review): a line that consumes V (presumably replacing User's
        // uses with V) appears to be missing from this extraction.
        return IC.eraseInstFromFunction(*User);
      }
    }
    return std::nullopt;
  }
  }
  return std::nullopt;
}
254
// Demanded-vector-elements simplification for MVE narrowing intrinsics.
// NOTE(review): the opening signature line of this member function appears to
// be missing from this extraction.
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
  // opcode specifying a Top/Bottom instruction, which can change between
  // instructions.
  auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // The only odd/even lanes of operand 0 will only be demanded depending
    // on whether this is a top/bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                                : APInt::getHighBitsSet(2, 1));
    return std::nullopt;
  };

  // Operand index of the top/bottom flag differs per intrinsic.
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return std::nullopt;
}
296
// Materialization cost of an integer immediate, split by ARM / Thumb2 /
// Thumb1 encodings.
// NOTE(review): the opening signature lines of this member function appear to
// be missing from this extraction.
  assert(Ty->isIntegerTy());

  // Immediates wider than 64 bits (or of zero-sized type) are assumed
  // expensive.
  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    // ARM mode: movw range, or a (possibly inverted) shifter-operand
    // immediate, costs a single instruction.
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    // Thumb2: same idea with the T2 modified-immediate encoding.
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1, any i8 imm cost 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}
329
// Constants smaller than 256 fit in the immediate field of
// Thumb1 instructions so we return a zero cost and 1 otherwise.
// NOTE(review): the opening signature line of this member function appears to
// be missing from this extraction.
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}
339
// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  // NOTE(review): a line initializing InstSPF (presumably a
  // matchSelectPattern call) appears to be missing from this extraction.

  if (InstSPF == SPF_SMAX &&
      // NOTE(review): a match line binding C appears to be missing here.
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    // Returns true if MinInst is an smin select clamping to (-Imm)-1, the
    // matching upper saturation bound.
    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            // NOTE(review): a match line binding MinC appears to be missing
            // here.
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    // The smin may be the operand of this smax, or one of its two users.
    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}
374
375// Look for a FP Saturation pattern, where the instruction can be simplified to
376// a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
377static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
378 if (Imm.getBitWidth() != 64 ||
379 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
380 return false;
381 Value *FP = isSSATMinMaxPattern(Inst, Imm);
382 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
383 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
384 if (!FP)
385 return false;
386 return isa<FPToSIInst>(FP);
387}
388
// Cost of an immediate operand in the context of its using instruction;
// returns 0 for immediates that fold into the instruction for free.
// NOTE(review): the opening signature line and the CostKind parameter line of
// this member function appear to be missing from this extraction.
                                              const APInt &Imm, Type *Ty,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensures negative constant of min(max()) or max(min()) patterns that
  // match to SSAT instructions don't get hoisted
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  // FP saturation patterns also keep their constant for free.
  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}
459
// Control-flow instruction cost; vector targets report these as free (see
// FIXME below).
// NOTE(review): the opening signature lines and the start of the `if`
// condition below appear to be missing from this extraction.
                                           const Instruction *I) {
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensistive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
473
// Cost model for cast instructions (extends, truncates, FP conversions) on
// NEON and MVE, driven by per-ISA conversion cost tables.
// NOTE(review): the opening signature line and the CCH/CostKind parameter
// lines of this member function appear to be missing from this extraction.
                                             Type *Src,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    // NOTE(review): the guard line (presumably checking CostKind) before this
    // return appears to be missing from this extraction.
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  // FP element types that the subtarget can operate on natively.
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
           (EltVT == MVT::f64 && ST->hasFP64()) ||
           (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked load/Truncating masked stores is expensive because we
  // currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
      // NOTE(review): the continuation of this multiplier expression appears
      // to be missing from this extraction.

  // The extend of other kinds of load is free
  if (CCH == TTI::CastContextHint::Normal ||
      // NOTE(review): the remainder of this condition (and opening brace)
      // appears to be missing from this extraction.
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so need to
        // split the load. This introduced an extra load operation, but the
        // extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As general rule, fp converts that were not matched above are scalarized
    // and cost 1 vcvt for each lane, so long as the instruction is available.
    // If not it will become a series of function calls.
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }

  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
      SrcTy.isFixedLengthVector()) {
    // Treat a truncate with larger than legal source (128bits for MVE) as
    // expensive, 2 instructions per lane.
    if ((SrcTy.getScalarType() == MVT::i8 ||
         SrcTy.getScalarType() == MVT::i16 ||
         SrcTy.getScalarType() == MVT::i32) &&
        SrcTy.getSizeInBits() > 128 &&
        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
      return SrcTy.getVectorNumElements() * 2;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
  // NOTE(review): the true-branch of this conditional expression (presumably
  // the MVE vector cost factor) appears to be missing from this extraction.
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
877
// Cost of vector insert/extract element operations on NEON and MVE.
// NOTE(review): the opening signature lines of this member function appear to
// be missing from this extraction.
                                              unsigned Index, Value *Op0,
                                              Value *Op1) {
  // Penalize inserting into an D-subregister. We end up with a three times
  // lower estimated throughput on swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should be therefore penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max<InstructionCost>(
          BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
          2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integer involve being passes to GPR registers,
    // causing more of a delay.
    std::pair<InstructionCost, MVT> LT =
    // NOTE(review): the initializer of LT (presumably a type-legalization
    // cost query) appears to be missing from this extraction.
    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
}
916
// Cost of compare/select instructions (upstream: ARMTTIImpl::getCmpSelInstrCost).
// Handles Thumb scalar select code-size costs, min/max/abs idioms, NEON vbsl
// lowering, and MVE vector compares. NOTE(review): doxygen-extraction artifact —
// the first signature line (rendered 917) and parameter line 920 (presumably
// the CostKind parameter) are missing from this chunk.
918 Type *CondTy,
919 CmpInst::Predicate VecPred,
921 const Instruction *I) {
922 int ISD = TLI->InstructionOpcodeToISD(Opcode);
923
924 // Thumb scalar code size cost for select.
925 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
926 ST->isThumb() && !ValTy->isVectorTy()) {
927 // Assume expensive structs.
928 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
929 return TTI::TCC_Expensive;
930
931 // Select costs can vary because they:
932 // - may require one or more conditional mov (including an IT),
933 // - can't operate directly on immediates,
934 // - require live flags, which we can't copy around easily.
// NOTE(review): extraction gap — rendered line 935 (presumably the
// declaration/initialization of 'Cost') is missing here.
936
937 // Possible IT instruction for Thumb2, or more for Thumb1.
938 ++Cost;
939
940 // i1 values may need rematerialising by using mov immediates and/or
941 // flag setting instructions.
942 if (ValTy->isIntegerTy(1))
943 ++Cost;
944
945 return Cost;
946 }
947
948 // If this is a vector min/max/abs, use the cost of that intrinsic directly
949 // instead. Hopefully when min/max intrinsics are more prevalent this code
950 // will not be needed.
951 const Instruction *Sel = I;
952 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
953 Sel->hasOneUse())
954 Sel = cast<Instruction>(Sel->user_back());
955 if (Sel && ValTy->isVectorTy() &&
956 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
957 const Value *LHS, *RHS;
// NOTE(review): extraction gap — rendered line 958 (presumably the
// matchSelectPattern(...) call that computes 'SPF') is missing here.
959 unsigned IID = 0;
960 switch (SPF) {
961 case SPF_ABS:
962 IID = Intrinsic::abs;
963 break;
964 case SPF_SMIN:
965 IID = Intrinsic::smin;
966 break;
967 case SPF_SMAX:
968 IID = Intrinsic::smax;
969 break;
970 case SPF_UMIN:
971 IID = Intrinsic::umin;
972 break;
973 case SPF_UMAX:
974 IID = Intrinsic::umax;
975 break;
976 case SPF_FMINNUM:
977 IID = Intrinsic::minnum;
978 break;
979 case SPF_FMAXNUM:
980 IID = Intrinsic::maxnum;
981 break;
982 default:
983 break;
984 }
985 if (IID) {
986 // The ICmp is free, the select gets the cost of the min/max/etc
987 if (Sel != I)
988 return 0;
989 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
990 return getIntrinsicInstrCost(CostAttrs, CostKind);
991 }
992 }
993
994 // On NEON a vector select gets lowered to vbsl.
995 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
996 // Lowering of some vector selects is currently far from perfect.
997 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
998 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
999 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1000 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1001 };
1002
1003 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1004 EVT SelValTy = TLI->getValueType(DL, ValTy);
1005 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1006 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1007 SelCondTy.getSimpleVT(),
1008 SelValTy.getSimpleVT()))
1009 return Entry->Cost;
1010 }
1011
1012 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1013 return LT.first;
1014 }
1015
1016 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1017 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1018 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1019 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1020 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1021 if (!VecCondTy)
1022 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1023
1024 // If we don't have mve.fp any fp operations will need to be scalarized.
1025 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1026 // One scalarization insert, one scalarization extract and the cost of the
1027 // fcmps.
1028 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1029 /*Extract*/ true, CostKind) +
1030 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1031 /*Extract*/ false, CostKind) +
1032 VecValTy->getNumElements() *
1033 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1034 VecCondTy->getScalarType(), VecPred,
1035 CostKind, I);
1036 }
1037
1038 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1039 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1040 // There are two types - the input that specifies the type of the compare
1041 // and the output vXi1 type. Because we don't know how the output will be
1042 // split, we may need an expensive shuffle to get two in sync. This has the
1043 // effect of making larger than legal compares (v8i32 for example)
1044 // expensive.
1045 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1046 if (LT.first > 1)
1047 return LT.first * BaseCost +
1048 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1049 /*Extract*/ false, CostKind);
1050 return BaseCost;
1051 }
1052 }
1053
1054 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1055 // for "multiple beats" potentially needed by MVE instructions.
1056 int BaseCost = 1;
1057 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1058 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1059
1060 return BaseCost *
1061 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1062}
1063
// Cost of computing a (possibly vector) address (upstream:
// ARMTTIImpl::getAddressComputationCost). NOTE(review): doxygen-extraction
// artifact — the first signature line (rendered 1064) is missing.
1065 ScalarEvolution *SE,
1066 const SCEV *Ptr) {
1067 // Address computations in vectorized code with non-consecutive addresses will
1068 // likely result in more instructions compared to scalar code where the
1069 // computation can more often be merged into the index mode. The resulting
1070 // extra micro-ops can significantly decrease throughput.
1071 unsigned NumVectorInstToHideOverhead = 10;
1072 int MaxMergeDistance = 64;
1073
1074 if (ST->hasNEON()) {
// Non-constant-strided (or wide-strided) vector accesses can't fold the
// address computation into the addressing mode; charge a high cost.
1075 if (Ty->isVectorTy() && SE &&
1076 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1077 return NumVectorInstToHideOverhead;
1078
1079 // In many cases the address computation is not merged into the instruction
1080 // addressing mode.
1081 return 1;
1082 }
1083 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1084}
1085
// Returns true when the instruction should be kept by LSR chains — presumably
// ARMTTIImpl::isProfitableLSRChainElement; the signature line (rendered 1086)
// is missing from this extraction, so confirm against upstream LLVM.
1087 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1088 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1089 // optimized, else LSR may block tail-predication.
1090 switch (II->getIntrinsicID()) {
1091 case Intrinsic::arm_mve_vctp8:
1092 case Intrinsic::arm_mve_vctp16:
1093 case Intrinsic::arm_mve_vctp32:
1094 case Intrinsic::arm_mve_vctp64:
1095 return true;
1096 default:
1097 break;
1098 }
1099 }
1100 return false;
1101}
1102
1103bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1104 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1105 return false;
1106
1107 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1108 // Don't support v2i1 yet.
1109 if (VecTy->getNumElements() == 2)
1110 return false;
1111
1112 // We don't support extending fp types.
1113 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1114 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1115 return false;
1116 }
1117
1118 unsigned EltWidth = DataTy->getScalarSizeInBits();
1119 return (EltWidth == 32 && Alignment >= 4) ||
1120 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1121}
1122
// Legality of MVE gather/scatter for element type of Ty — presumably
// ARMTTIImpl::isLegalMaskedGather; the signature line (rendered 1123) is
// missing from this extraction. Gated on -enable-arm-masked-gather-scatters.
1124 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1125 return false;
1126
// i8 elements are always legal; i16/i32 require natural alignment.
1127 unsigned EltWidth = Ty->getScalarSizeInBits();
1128 return ((EltWidth == 32 && Alignment >= 4) ||
1129 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1130}
1131
1132/// Given a memcpy/memset/memmove instruction, return the number of memory
1133/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1134/// call is used.
// NOTE(review): extraction gap — the signature line (rendered 1135,
// presumably ARMTTIImpl::getNumMemOps(const IntrinsicInst *I)) is missing.
1136 MemOp MOp;
1137 unsigned DstAddrSpace = ~0u;
1138 unsigned SrcAddrSpace = ~0u;
1139 const Function *F = I->getParent()->getParent();
1140
// memcpy/memmove: both source and destination matter.
1141 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1142 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1143 // If 'size' is not a constant, a library call will be generated.
1144 if (!C)
1145 return -1;
1146
1147 const unsigned Size = C->getValue().getZExtValue();
1148 const Align DstAlign = *MC->getDestAlign();
1149 const Align SrcAlign = *MC->getSourceAlign();
1150
1151 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1152 /*IsVolatile*/ false);
1153 DstAddrSpace = MC->getDestAddressSpace();
1154 SrcAddrSpace = MC->getSourceAddressSpace();
1155 }
// memset: only the destination side exists.
1156 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1157 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1158 // If 'size' is not a constant, a library call will be generated.
1159 if (!C)
1160 return -1;
1161
1162 const unsigned Size = C->getValue().getZExtValue();
1163 const Align DstAlign = *MS->getDestAlign();
1164
1165 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1166 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1167 DstAddrSpace = MS->getDestAddressSpace();
1168 }
1169 else
1170 llvm_unreachable("Expected a memcpy/move or memset!");
1171
// Factor 2 = one load + one store per op; memset only stores (Factor 1).
1172 unsigned Limit, Factor = 2;
1173 switch(I->getIntrinsicID()) {
1174 case Intrinsic::memcpy:
1175 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1176 break;
1177 case Intrinsic::memmove:
1178 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1179 break;
1180 case Intrinsic::memset:
1181 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1182 Factor = 1;
1183 break;
1184 default:
1185 llvm_unreachable("Expected a memcpy/move or memset!");
1186 }
1187
1188 // MemOps will be populated with a list of data types that needs to be
1189 // loaded and stored. That's why we multiply the number of elements by 2 to
1190 // get the cost for this memcpy.
1191 std::vector<EVT> MemOps;
1192 if (getTLI()->findOptimalMemOpLowering(
1193 MemOps, Limit, MOp, DstAddrSpace,
1194 SrcAddrSpace, F->getAttributes()))
1195 return MemOps.size() * Factor;
1196
1197 // If we can't find an optimal memop lowering, return the default cost
1198 return -1;
1199}
1200
// Cost of a memcpy/memmove/memset intrinsic, in "number of memory ops".
// NOTE(review): extraction gap — the signature line (rendered 1201,
// presumably ARMTTIImpl::getMemcpyCost(const Instruction *I)) is missing.
1202 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1203
1204 // To model the cost of a library call, we assume 1 for the call, and
1205 // 3 for the argument setup.
1206 if (NumOps == -1)
1207 return 4;
1208 return NumOps;
1209}
1210
// Shuffle cost model (upstream: ARMTTIImpl::getShuffleCost) using NEON/MVE
// cost tables for broadcast/reverse/select kinds and VREV masks.
// NOTE(review): doxygen-extraction artifact — signature lines (rendered
// 1211/1213/1215) and several statement lines (1221, 1303, 1318, 1320) are
// missing from this chunk; annotated inline below.
1212 VectorType *Tp, ArrayRef<int> Mask,
1214 int Index, VectorType *SubTp,
1216 const Instruction *CxtI) {
1217 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
1218 // Treat extractsubvector as single op permutation.
1219 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1220 if (IsExtractSubvector)
// NOTE(review): extraction gap — rendered line 1221 (the statement guarded
// by the 'if' above, presumably reassigning Kind) is missing.
1222 if (ST->hasNEON()) {
1223 if (Kind == TTI::SK_Broadcast) {
1224 static const CostTblEntry NEONDupTbl[] = {
1225 // VDUP handles these cases.
1226 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1227 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1228 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1229 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1230 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1231 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1232
1233 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1234 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1235 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1236 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1237
1238 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1239 if (const auto *Entry =
1240 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1241 return LT.first * Entry->Cost;
1242 }
1243 if (Kind == TTI::SK_Reverse) {
1244 static const CostTblEntry NEONShuffleTbl[] = {
1245 // Reverse shuffle cost one instruction if we are shuffling within a
1246 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1247 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1248 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1249 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1250 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1251 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1252 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1253
1254 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1255 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1256 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1257 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1258
1259 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1260 if (const auto *Entry =
1261 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1262 return LT.first * Entry->Cost;
1263 }
1264 if (Kind == TTI::SK_Select) {
1265 static const CostTblEntry NEONSelShuffleTbl[] = {
1266 // Select shuffle cost table for ARM. Cost is the number of
1267 // instructions
1268 // required to create the shuffled vector.
1269
1270 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1271 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1272 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1273 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1274
1275 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1276 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1277 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1278
1279 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1280
1281 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1282
1283 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1284 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1285 ISD::VECTOR_SHUFFLE, LT.second))
1286 return LT.first * Entry->Cost;
1287 }
1288 }
1289 if (ST->hasMVEIntegerOps()) {
1290 if (Kind == TTI::SK_Broadcast) {
1291 static const CostTblEntry MVEDupTbl[] = {
1292 // VDUP handles these cases.
1293 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1294 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1295 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1296 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1297 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1298
1299 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1300 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1301 LT.second))
1302 return LT.first * Entry->Cost *
// NOTE(review): extraction gap — rendered line 1303 (the trailing factor of
// this return expression, presumably an MVE cost-factor call) is missing.
1304 }
1305
// VREV-shaped masks lower to a single MVE VREV instruction.
1306 if (!Mask.empty()) {
1307 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1308 if (LT.second.isVector() &&
1309 Mask.size() <= LT.second.getVectorNumElements() &&
1310 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1311 isVREVMask(Mask, LT.second, 64)))
1312 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1313 }
1314 }
1315
1316 // Restore optimal kind.
1317 if (IsExtractSubvector)
// NOTE(review): extraction gap — rendered line 1318 (the statement guarded
// by the 'if' above, presumably restoring Kind) is missing.
1319 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
// NOTE(review): extraction gap — rendered line 1320 (the '?' arm of this
// conditional, presumably an MVE cost-factor call) is missing.
1321 : 1;
1322 return BaseCost *
1323 BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1324}
1325
// Cost of scalar/vector arithmetic (upstream: ARMTTIImpl::getArithmeticInstrCost):
// Thumb i1 logic costs, NEON division tables, free-shift folding, and MVE
// beat-count scaling. NOTE(review): doxygen-extraction artifact — signature
// lines (rendered 1326/1328-1329) and statement lines 1396 and 1455-1456 are
// missing; annotated inline below.
1327 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1330 const Instruction *CxtI) {
1331 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1332 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1333 // Make operations on i1 relatively expensive as this often involves
1334 // combining predicates. AND and XOR should be easier to handle with IT
1335 // blocks.
1336 switch (ISDOpcode) {
1337 default:
1338 break;
1339 case ISD::AND:
1340 case ISD::XOR:
1341 return 2;
1342 case ISD::OR:
1343 return 3;
1344 }
1345 }
1346
1347 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1348
1349 if (ST->hasNEON()) {
1350 const unsigned FunctionCallDivCost = 20;
1351 const unsigned ReciprocalDivCost = 10;
1352 static const CostTblEntry CostTbl[] = {
1353 // Division.
1354 // These costs are somewhat random. Choose a cost of 20 to indicate that
1355 // vectorizing division (added function call) is going to be very expensive.
1356 // Double registers types.
1357 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1358 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1359 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1360 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1361 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1362 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1363 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1364 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1365 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1366 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1367 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1368 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1369 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1370 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1371 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1372 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1373 // Quad register types.
1374 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1375 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1376 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1377 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1378 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1379 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1380 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1381 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1382 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1383 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1384 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1385 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1386 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1387 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1388 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1389 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1390 // Multiplication.
1391 };
1392
1393 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1394 return LT.first * Entry->Cost;
1395
// NOTE(review): extraction gap — rendered line 1396 (the declaration of
// 'Cost', presumably a BaseT::getArithmeticInstrCost( call) is missing.
1397 Opcode, Ty, CostKind, Op1Info, Op2Info);
1398
1399 // This is somewhat of a hack. The problem that we are facing is that SROA
1400 // creates a sequence of shift, and, or instructions to construct values.
1401 // These sequences are recognized by the ISel and have zero-cost. Not so for
1402 // the vectorized code. Because we have support for v2i64 but not i64 those
1403 // sequences look particularly beneficial to vectorize.
1404 // To work around this we increase the cost of v2i64 operations to make them
1405 // seem less beneficial.
1406 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1407 Cost += 4;
1408
1409 return Cost;
1410 }
1411
1412 // If this operation is a shift on arm/thumb2, it might well be folded into
1413 // the following instruction, hence having a cost of 0.
1414 auto LooksLikeAFreeShift = [&]() {
1415 if (ST->isThumb1Only() || Ty->isVectorTy())
1416 return false;
1417
1418 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1419 return false;
1420 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1421 return false;
1422
1423 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1424 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1425 case Instruction::Add:
1426 case Instruction::Sub:
1427 case Instruction::And:
1428 case Instruction::Xor:
1429 case Instruction::Or:
1430 case Instruction::ICmp:
1431 return true;
1432 default:
1433 return false;
1434 }
1435 };
1436 if (LooksLikeAFreeShift())
1437 return 0;
1438
1439 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1440 // for "multiple beats" potentially needed by MVE instructions.
1441 int BaseCost = 1;
1442 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1443 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1444
1445 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1446 // without treating floats as more expensive that scalars or increasing the
1447 // costs for custom operations. The results is also multiplied by the
1448 // MVEVectorCostFactor where appropriate.
1449 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1450 return LT.first * BaseCost;
1451
1452 // Else this is expand, assume that we need to scalarize this op.
1453 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1454 unsigned Num = VTy->getNumElements();
// NOTE(review): extraction gap — rendered lines 1455-1456 (presumably the
// per-scalar 'Cost' computation used below) are missing.
1457 // Return the cost of multiple scalar invocation plus the cost of
1458 // inserting and extracting the values.
1459 SmallVector<Type *> Tys(Args.size(), Ty);
1460 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1461 Num * Cost;
1462 }
1463
1464 return BaseCost;
1465}
1466
// Load/store cost (upstream: ARMTTIImpl::getMemoryOpCost): penalizes
// unaligned NEON f64 vectors and recognizes MVE extending fp16 load/store
// patterns. NOTE(review): doxygen-extraction artifact — signature lines
// (rendered 1467/1470) and statement lines 1474/1508 are missing; annotated
// inline below.
1468 MaybeAlign Alignment,
1469 unsigned AddressSpace,
1471 TTI::OperandValueInfo OpInfo,
1472 const Instruction *I) {
1473 // TODO: Handle other cost kinds.
// NOTE(review): extraction gap — rendered line 1474 (the condition guarding
// this early 'return 1', presumably a CostKind check) is missing.
1475 return 1;
1476
1477 // Type legalization can't handle structs
1478 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1479 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1480 CostKind);
1481
1482 if (ST->hasNEON() && Src->isVectorTy() &&
1483 (Alignment && *Alignment != Align(16)) &&
1484 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1485 // Unaligned loads/stores are extremely inefficient.
1486 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1487 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1488 return LT.first * 4;
1489 }
1490
1491 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1492 // Same for stores.
1493 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1494 ((Opcode == Instruction::Load && I->hasOneUse() &&
1495 isa<FPExtInst>(*I->user_begin())) ||
1496 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1497 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1498 Type *DstTy =
1499 Opcode == Instruction::Load
1500 ? (*I->user_begin())->getType()
1501 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1502 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1503 DstTy->getScalarType()->isFloatTy())
1504 return ST->getMVEVectorCostFactor(CostKind);
1505 }
1506
1507 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
// NOTE(review): extraction gap — rendered line 1508 (the '?' arm of this
// conditional, presumably an MVE cost-factor call) is missing.
1509 : 1;
1510 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1511 CostKind, OpInfo, I);
1512}
1513
// Cost of masked load/store. NOTE(review): doxygen-extraction artifact —
// the return-type line (rendered 1514) and the line carrying the CostKind
// parameter / opening brace (rendered 1517) are missing from this chunk.
1515ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1516 unsigned AddressSpace,
// Legal MVE predicated loads/stores cost one beat-scaled instruction.
1518 if (ST->hasMVEIntegerOps()) {
1519 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1520 return ST->getMVEVectorCostFactor(CostKind);
1521 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1522 return ST->getMVEVectorCostFactor(CostKind);
1523 }
1524 if (!isa<FixedVectorType>(Src))
1525 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1526 CostKind);
1527 // Scalar cost, which is currently very high due to the efficiency of the
1528 // generated code.
1529 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1530}
1531
// Cost of vldN/vstN interleaved accesses. NOTE(review): doxygen-extraction
// artifact — the first signature line (rendered 1532, presumably the return
// type and 'ARMTTIImpl::getInterleavedMemoryOpCost(') is missing.
1533 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1534 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1535 bool UseMaskForCond, bool UseMaskForGaps) {
1536 assert(Factor >= 2 && "Invalid interleave factor");
1537 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1538
1539 // vldN/vstN doesn't support vector types of i64/f64 element.
1540 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1541
1542 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1543 !UseMaskForCond && !UseMaskForGaps) {
1544 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1545 auto *SubVecTy =
1546 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1547
1548 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1549 // Accesses having vector types that are a multiple of 128 bits can be
1550 // matched to more than one vldN/vstN instruction.
1551 int BaseCost =
1552 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1553 if (NumElts % Factor == 0 &&
1554 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1555 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1556
1557 // Some smaller than legal interleaved patterns are cheap as we can make
1558 // use of the vmovn or vrev patterns to interleave a standard load. This is
1559 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1560 // promoted differently). The cost of 2 here is then a load and vrev or
1561 // vmovn.
1562 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1563 VecTy->isIntOrIntVectorTy() &&
1564 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1565 return 2 * BaseCost;
1566 }
1567
1568 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1569 Alignment, AddressSpace, CostKind,
1570 UseMaskForCond, UseMaskForGaps)
1571}
1572
// Gather/scatter cost: compares an MVE vector cost against a (high)
// scalarized cost, allowing the vector form only for 128-bit, >=4-element
// patterns with correctly extended offsets. NOTE(review): doxygen-extraction
// artifact — the first signature line (rendered 1573, presumably the return
// type and 'ARMTTIImpl::getGatherScatterOpCost(') is missing.
1574 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1575 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1576 using namespace PatternMatch;
1577 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1578 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1579 Alignment, CostKind, I);
1580
1581 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1582 auto *VTy = cast<FixedVectorType>(DataTy);
1583
1584 // TODO: Splitting, once we do that.
1585
1586 unsigned NumElems = VTy->getNumElements();
1587 unsigned EltSize = VTy->getScalarSizeInBits();
1588 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1589
1590 // For now, it is assumed that for the MVE gather instructions the loads are
1591 // all effectively serialised. This means the cost is the scalar cost
1592 // multiplied by the number of elements being loaded. This is possibly very
1593 // conservative, but even so we still end up vectorising loops because the
1594 // cost per iteration for many loops is lower than for scalar loops.
1595 InstructionCost VectorCost =
1596 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1597 // The scalarization cost should be a lot higher. We use the number of vector
1598 // elements plus the scalarization overhead. If masking is required then a lot
1599 // of little blocks will be needed and potentially a scalarized p0 mask,
1600 // greatly increasing the cost.
1601 InstructionCost ScalarCost =
1602 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1603 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1604 CostKind) +
1605 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1606 CostKind);
1607
1608 if (EltSize < 8 || Alignment < EltSize / 8)
1609 return ScalarCost;
1610
1611 unsigned ExtSize = EltSize;
1612 // Check whether there's a single user that asks for an extended type
1613 if (I != nullptr) {
1614 // Depending on the caller of this function, a gather instruction will
1615 // either have opcode Instruction::Load or be a call to the masked_gather
1616 // intrinsic
1617 if ((I->getOpcode() == Instruction::Load ||
1618 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1619 I->hasOneUse()) {
1620 const User *Us = *I->users().begin();
1621 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1622 // only allow valid type combinations
1623 unsigned TypeSize =
1624 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1625 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1626 (TypeSize == 16 && EltSize == 8)) &&
1627 TypeSize * NumElems == 128) {
1628 ExtSize = TypeSize;
1629 }
1630 }
1631 }
1632 // Check whether the input data needs to be truncated
1633 TruncInst *T;
1634 if ((I->getOpcode() == Instruction::Store ||
1635 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1636 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1637 // Only allow valid type combinations
1638 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1639 if (((EltSize == 16 && TypeSize == 32) ||
1640 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1641 TypeSize * NumElems == 128)
1642 ExtSize = TypeSize;
1643 }
1644 }
1645
1646 if (ExtSize * NumElems != 128 || NumElems < 4)
1647 return ScalarCost;
1648
1649 // Any (aligned) i32 gather will not need to be scalarised.
1650 if (ExtSize == 32)
1651 return VectorCost;
1652 // For smaller types, we need to ensure that the gep's inputs are correctly
1653 // extended from a small enough value. Other sizes (including i64) are
1654 // scalarized for now.
1655 if (ExtSize != 8 && ExtSize != 16)
1656 return ScalarCost;
1657
1658 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1659 Ptr = BC->getOperand(0);
1660 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1661 if (GEP->getNumOperands() != 2)
1662 return ScalarCost;
1663 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1664 // Scale needs to be correct (which is only relevant for i16s).
1665 if (Scale != 1 && Scale * 8 != ExtSize)
1666 return ScalarCost;
1667 // And we need to zext (not sext) the indexes from a small enough type.
1668 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1669 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1670 return VectorCost;
1671 }
1672 return ScalarCost;
1673 }
1674 return ScalarCost;
1675}
1676
// Cost of horizontal reductions (upstream: ARMTTIImpl::getArithmeticReductionCost):
// models FP fadd/fmul tree reductions, bitwise and/or/xor reductions, and MVE
// VADDV integer adds. NOTE(review): doxygen-extraction artifact — signature
// lines (rendered 1677-1678, 1680) and statement lines 1715/1747 are missing;
// annotated inline below.
1679 std::optional<FastMathFlags> FMF,
1681
1682 EVT ValVT = TLI->getValueType(DL, ValTy);
1683 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1684 unsigned EltSize = ValVT.getScalarSizeInBits();
1685
1686 // In general floating point reductions are a series of elementwise
1687 // operations, with free extracts on each step. These are either in-order or
1688 // treewise depending on whether that is allowed by the fast math flags.
1689 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1690 ((EltSize == 32 && ST->hasVFP2Base()) ||
1691 (EltSize == 64 && ST->hasFP64()) ||
1692 (EltSize == 16 && ST->hasFullFP16()))) {
1693 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1694 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1695 InstructionCost VecCost = 0;
// Tree reduction: halve the vector with one vector op per step until it
// fits the widest legal register.
1696 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1697 NumElts * EltSize > VecLimit) {
1698 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1699 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1700 NumElts /= 2;
1701 }
1702
1703 // For fp16 we need to extract the upper lane elements. MVE can add a
1704 // VREV+FMIN/MAX to perform another vector step instead.
1705 InstructionCost ExtractCost = 0;
1706 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1707 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1708 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1709 NumElts /= 2;
1710 } else if (ValVT.getVectorElementType() == MVT::f16)
1711 ExtractCost = NumElts / 2;
1712
1713 return VecCost + ExtractCost +
1714 NumElts *
// NOTE(review): extraction gap — rendered line 1715 (the trailing factor of
// this return, presumably a scalar getArithmeticInstrCost call) is missing.
1716 }
1717
1718 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1719 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1720 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1721 unsigned VecLimit =
1722 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1723 InstructionCost VecCost = 0;
1724 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1725 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1726 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1727 NumElts /= 2;
1728 }
1729 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1730 // step.
1731 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1732 NumElts * EltSize == 64) {
1733 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1734 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1735 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1736 NumElts /= 2;
1737 }
1738
1739 // From here we extract the elements and perform the and/or/xor.
1740 InstructionCost ExtractCost = NumElts;
1741 return VecCost + ExtractCost +
1742 (NumElts - 1) * getArithmeticInstrCost(
1743 Opcode, ValTy->getElementType(), CostKind);
1744 }
1745
1746 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
// NOTE(review): extraction gap — rendered line 1747 (the remainder of this
// condition) is missing.
1748 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1749
1750 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1751
// Legal MVE VADDV reductions: one beat-scaled instruction per legal vector.
1752 static const CostTblEntry CostTblAdd[]{
1753 {ISD::ADD, MVT::v16i8, 1},
1754 {ISD::ADD, MVT::v8i16, 1},
1755 {ISD::ADD, MVT::v4i32, 1},
1756 };
1757 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1758 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1759
1760 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1761}
1762
// getExtendedReductionCost - cost of a reduction whose input is widened before
// accumulation (e.g. vecreduce.add(sext/zext(x))). On MVE an extending integer
// add reduction can map onto a single VADDV/VADDLV; everything else defers to
// the target-independent model.
// NOTE(review): the extraction dropped the function-name/parameter lines here;
// the code lines below are kept byte-identical.
1764 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1766 EVT ValVT = TLI->getValueType(DL, ValTy);
1767 EVT ResVT = TLI->getValueType(DL, ResTy);
1768
1769 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1770
// Only extending ADD reductions have a dedicated MVE lowering; other opcodes
// fall through to the base implementation below.
1771 switch (ISD) {
1772 case ISD::ADD:
1773 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1774 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1775
1776 // The legal cases are:
1777 // VADDV u/s 8/16/32
1778 // VADDLV u/s 32
1779 // Codegen currently cannot always handle larger than legal vectors very
1780 // well, especially for predicated reductions where the mask needs to be
1781 // split, so restrict to 128bit or smaller input types.
1782 unsigned RevVTSize = ResVT.getSizeInBits();
1783 if (ValVT.getSizeInBits() <= 128 &&
1784 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1785 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1786 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1787 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1788 }
1789 break;
1790 default:
1791 break;
1792 }
// Not a profitable MVE pattern - use the default estimate.
1793 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1794 CostKind);
1795}
1796
// getMulAccReductionCost - cost of a multiply-accumulate reduction
// (vecreduce.add(mul(ext(x), ext(y)))). MVE lowers the legal cases to a
// single VMLAV/VMLALV, costed below; otherwise defer to the base model.
// NOTE(review): the extraction dropped the function-name/parameter lines; the
// code lines below are kept byte-identical.
1799 VectorType *ValTy,
1801 EVT ValVT = TLI->getValueType(DL, ValTy);
1802 EVT ResVT = TLI->getValueType(DL, ResTy);
1803
1804 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1805 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1806
1807 // The legal cases are:
1808 // VMLAV u/s 8/16/32
1809 // VMLALV u/s 16/32
1810 // Codegen currently cannot always handle larger than legal vectors very
1811 // well, especially for predicated reductions where the mask needs to be
1812 // split, so restrict to 128bit or smaller input types.
1813 unsigned RevVTSize = ResVT.getSizeInBits();
1814 if (ValVT.getSizeInBits() <= 128 &&
1815 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1816 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1817 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1818 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1819 }
1820
1821 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1822}
1823
// getMinMaxReductionCost - cost of a min/max reduction. FP minnum/maxnum
// reductions are modelled as a log2 tree of vector steps plus scalar
// operations on the remaining lanes; integer min/max reductions on MVE use
// the VMINV/VMAXV family via the table below.
// NOTE(review): the extraction dropped the function-name/parameter lines; the
// code lines below are kept byte-identical.
1826 FastMathFlags FMF,
1828 EVT ValVT = TLI->getValueType(DL, Ty);
1829
1830 // In general floating point reductions are a series of elementwise
1831 // operations, with free extracts on each step. These are either in-order or
1832 // treewise depending on whether that is allowed by the fast math flags.
1833 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1834 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1835 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1836 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1837 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1838 unsigned EltSize = ValVT.getScalarSizeInBits();
// VecLimit is the widest vector the target can operate on directly; -1
// (wraps to UINT_MAX) disables the vector-halving loop entirely.
1839 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1840 InstructionCost VecCost;
// Halve the vector with one vector min/max per step until it is legal.
1841 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1842 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1843 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1844 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1845 NumElts /= 2;
1846 }
1847
1848 // For fp16 we need to extract the upper lane elements. MVE can add a
1849 // VREV+FMIN/MAX to perform another vector step instead.
1850 InstructionCost ExtractCost = 0;
1851 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1852 NumElts == 8) {
1853 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1854 NumElts /= 2;
1855 } else if (ValVT.getVectorElementType() == MVT::f16)
1856 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
1857
// NOTE(review): the extraction dropped the line declaring the scalar ICA
// (original line 1858) used by the return below - confirm against upstream.
1859 {Ty->getElementType(), Ty->getElementType()},
1860 FMF);
// Remaining NumElts lanes are reduced with NumElts-1 scalar min/max ops.
1861 return VecCost + ExtractCost +
1862 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1863 }
1864
1865 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1866 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1867 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1868
1869 // All costs are the same for u/s min/max. These lower to vminv, which are
1870 // given a slightly higher cost as they tend to take multiple cycles for
1871 // smaller type sizes.
1872 static const CostTblEntry CostTblAdd[]{
1873 {ISD::SMIN, MVT::v16i8, 4},
1874 {ISD::SMIN, MVT::v8i16, 3},
1875 {ISD::SMIN, MVT::v4i32, 2},
1876 };
// The table is keyed on SMIN only since all four intrinsics cost the same.
1877 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
1878 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1879 }
1880
1881 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1882}
1883
// getIntrinsicInstrCost - ARM-specific intrinsic costs. Handles lane masks,
// saturating arithmetic, integer/FP min/max, and saturating FP-to-int
// conversions; anything not matched falls through to the base implementation.
// NOTE(review): the extraction dropped the function-name/parameter lines and a
// few statement lines (noted inline); code lines are kept byte-identical.
1887 switch (ICA.getID()) {
1888 case Intrinsic::get_active_lane_mask:
1889 // Currently we make a somewhat optimistic assumption that
1890 // active_lane_mask's are always free. In reality it may be freely folded
1891 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1892 // of add/icmp code. We may need to improve this in the future, but being
1893 // able to detect if it is free or not involves looking at a lot of other
1894 // code. We currently assume that the vectorizer inserted these, and knew
1895 // what it was doing in adding one.
1896 if (ST->hasMVEIntegerOps())
1897 return 0;
1898 break;
1899 case Intrinsic::sadd_sat:
1900 case Intrinsic::ssub_sat:
1901 case Intrinsic::uadd_sat:
1902 case Intrinsic::usub_sat: {
// Saturating add/sub lower to VQADD/VQSUB on legal MVE vector types.
1903 if (!ST->hasMVEIntegerOps())
1904 break;
1905 Type *VT = ICA.getReturnType();
1906
1907 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1908 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1909 LT.second == MVT::v16i8) {
1910 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1911 // need to extend the type, as it uses shr(qadd(shl, shl)).
1912 unsigned Instrs =
1913 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1914 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1915 }
1916 break;
1917 }
1918 case Intrinsic::abs:
1919 case Intrinsic::smin:
1920 case Intrinsic::smax:
1921 case Intrinsic::umin:
1922 case Intrinsic::umax: {
// Single-instruction MVE lowerings (VABS/VMIN/VMAX) on legal vector types.
1923 if (!ST->hasMVEIntegerOps())
1924 break;
1925 Type *VT = ICA.getReturnType();
1926
1927 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1928 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1929 LT.second == MVT::v16i8)
1930 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1931 break;
1932 }
1933 case Intrinsic::minnum:
1934 case Intrinsic::maxnum: {
1935 if (!ST->hasMVEFloatOps())
1936 break;
1937 Type *VT = ICA.getReturnType();
1938 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1939 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1940 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1941 break;
1942 }
1943 case Intrinsic::fptosi_sat:
1944 case Intrinsic::fptoui_sat: {
1945 if (ICA.getArgTypes().empty())
1946 break;
1947 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1948 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1949 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1950 // Check for the legal types, with the correct subtarget features.
1951 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1952 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1953 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1954 return LT.first;
1955
1956 // Equally for MVE vector types
1957 if (ST->hasMVEFloatOps() &&
1958 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1959 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1960 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1961
1962 // Otherwise we use a legal convert followed by a min+max
1963 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1964 (ST->hasFP64() && LT.second == MVT::f64) ||
1965 (ST->hasFullFP16() && LT.second == MVT::f16) ||
1966 (ST->hasMVEFloatOps() &&
1967 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1968 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1969 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1970 LT.second.getScalarSizeInBits());
// NOTE(review): the extraction dropped the lines declaring/accumulating
// `Cost` around the two min/max attribute constructions - confirm upstream.
1972 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1973 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1974 : Intrinsic::umin,
1975 LegalTy, {LegalTy, LegalTy});
1977 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1978 : Intrinsic::umax,
1979 LegalTy, {LegalTy, LegalTy});
1981 return LT.first * Cost;
1982 }
1983 break;
1984 }
1985 }
1986
// NOTE(review): the final fall-through return (presumably
// BaseT::getIntrinsicInstrCost(ICA, CostKind)) was dropped by the extraction.
1988}
1989
// isLoweredToCall - return true if calling F will end up as a real machine
// call (a bl), false if it lowers to inline instructions. Used by cost models
// and the hardware-loop checks below (calls clobber LR).
// NOTE(review): the extraction dropped the function-name line; the code lines
// below are kept byte-identical.
1991 if (!F->isIntrinsic())
1992 return BaseT::isLoweredToCall(F);
1993
1994 // Assume all Arm-specific intrinsics map to an instruction.
1995 if (F->getName().starts_with("llvm.arm"))
1996 return false;
1997
1998 switch (F->getIntrinsicID()) {
1999 default: break;
// Transcendental math always goes through a libcall on ARM.
2000 case Intrinsic::powi:
2001 case Intrinsic::sin:
2002 case Intrinsic::cos:
2003 case Intrinsic::pow:
2004 case Intrinsic::log:
2005 case Intrinsic::log10:
2006 case Intrinsic::log2:
2007 case Intrinsic::exp:
2008 case Intrinsic::exp2:
2009 return true;
// These FP ops have instruction forms, but only when the FPU supports the
// element type in question.
2010 case Intrinsic::sqrt:
2011 case Intrinsic::fabs:
2012 case Intrinsic::copysign:
2013 case Intrinsic::floor:
2014 case Intrinsic::ceil:
2015 case Intrinsic::trunc:
2016 case Intrinsic::rint:
2017 case Intrinsic::nearbyint:
2018 case Intrinsic::round:
2019 case Intrinsic::canonicalize:
2020 case Intrinsic::lround:
2021 case Intrinsic::llround:
2022 case Intrinsic::lrint:
2023 case Intrinsic::llrint:
2024 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2025 return true;
2026 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2027 return true;
2028 // Some operations can be handled by vector instructions and assume
2029 // unsupported vectors will be expanded into supported scalar ones.
2030 // TODO Handle scalar operations properly.
2031 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
// Masked memory ops need MVE; otherwise they are expanded to calls/branches.
2032 case Intrinsic::masked_store:
2033 case Intrinsic::masked_load:
2034 case Intrinsic::masked_gather:
2035 case Intrinsic::masked_scatter:
2036 return !ST->hasMVEIntegerOps();
// Overflow-checked and saturating arithmetic always expands inline.
2037 case Intrinsic::sadd_with_overflow:
2038 case Intrinsic::uadd_with_overflow:
2039 case Intrinsic::ssub_with_overflow:
2040 case Intrinsic::usub_with_overflow:
2041 case Intrinsic::sadd_sat:
2042 case Intrinsic::uadd_sat:
2043 case Intrinsic::ssub_sat:
2044 case Intrinsic::usub_sat:
2045 return false;
2046 }
2047
2048 return BaseT::isLoweredToCall(F);
2049}
2050
// maybeLoweredToCall - conservatively decide whether instruction I might be
// legalized into a library call. Used when scanning loops for hardware-loop
// suitability, since any call trashes LR.
// NOTE(review): the extraction dropped the function-name line; the code lines
// below are kept byte-identical.
2052 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2053 EVT VT = TLI->getValueType(DL, I.getType(), true);
2054 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2055 return true;
2056
2057 // Check if an intrinsic will be lowered to a call and assume that any
2058 // other CallInst will generate a bl.
2059 if (auto *Call = dyn_cast<CallInst>(&I)) {
2060 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2061 switch(II->getIntrinsicID()) {
// mem intrinsics stay inline only if getNumMemOps finds an expansion plan.
2062 case Intrinsic::memcpy:
2063 case Intrinsic::memset:
2064 case Intrinsic::memmove:
2065 return getNumMemOps(II) == -1;
2066 default:
2067 if (const Function *F = Call->getCalledFunction())
2068 return isLoweredToCall(F);
2069 }
2070 }
2071 return true;
2072 }
2073
2074 // FPv5 provides conversions between integer, double-precision,
2075 // single-precision, and half-precision formats.
2076 switch (I.getOpcode()) {
2077 default:
2078 break;
2079 case Instruction::FPToSI:
2080 case Instruction::FPToUI:
2081 case Instruction::SIToFP:
2082 case Instruction::UIToFP:
2083 case Instruction::FPTrunc:
2084 case Instruction::FPExt:
2085 return !ST->hasFPARMv8Base();
2086 }
2087
2088 // FIXME: Unfortunately the approach of checking the Operation Action does
2089 // not catch all cases of Legalization that use library calls. Our
2090 // Legalization step categorizes some transformations into library calls as
2091 // Custom, Expand or even Legal when doing type legalization. So for now
2092 // we have to special case for instance the SDIV of 64bit integers and the
2093 // use of floating point emulation.
2094 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2095 switch (ISD) {
2096 default:
2097 break;
2098 case ISD::SDIV:
2099 case ISD::UDIV:
2100 case ISD::SREM:
2101 case ISD::UREM:
2102 case ISD::SDIVREM:
2103 case ISD::UDIVREM:
2104 return true;
2105 }
2106 }
2107
2108 // Assume all other non-float operations are supported.
2109 if (!VT.isFloatingPoint())
2110 return false;
2111
2112 // We'll need a library call to handle most floats when using soft.
2113 if (TLI->useSoftFloat()) {
2114 switch (I.getOpcode()) {
2115 default:
2116 return true;
// These opcodes only move or select values and never do FP arithmetic, so
// they are safe even under soft-float.
2117 case Instruction::Alloca:
2118 case Instruction::Load:
2119 case Instruction::Store:
2120 case Instruction::Select:
2121 case Instruction::PHI:
2122 return false;
2123 }
2124 }
2125
2126 // We'll need a libcall to perform double precision operations on a single
2127 // precision only FPU.
2128 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2129 return true;
2130
2131 // Likewise for half precision arithmetic.
2132 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2133 return true;
2134
2135 return false;
2136}
2137
// isHardwareLoopProfitable - decide whether loop L should become a v8.1-M
// low-overhead hardware loop (DLS/LE), and fill in HWLoopInfo if so.
// NOTE(review): the extraction dropped the function-name line and the
// condition guarding the "No BETC" early-out (original line 2149, presumably
// a hasLoopInvariantBackedgeTakenCount-style check - confirm upstream); code
// lines are kept byte-identical.
2139 AssumptionCache &AC,
2140 TargetLibraryInfo *LibInfo,
2141 HardwareLoopInfo &HWLoopInfo) {
2142 // Low-overhead branches are only supported in the 'low-overhead branch'
2143 // extension of v8.1-m.
2144 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2145 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2146 return false;
2147 }
2148
2150 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2151 return false;
2152 }
2153
2154 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2155 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2156 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2157 return false;
2158 }
2159
// Trip count = backedge-taken count + 1.
2160 const SCEV *TripCountSCEV =
2161 SE.getAddExpr(BackedgeTakenCount,
2162 SE.getOne(BackedgeTakenCount->getType()));
2163
2164 // We need to store the trip count in LR, a 32-bit register.
2165 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2166 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2167 return false;
2168 }
2169
2170 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2171 // point in generating a hardware loop if that's going to happen.
2172
// Reject loops that already contain hardware-loop intrinsics (e.g. from a
// previous HardwareLoops run) to avoid nesting/duplication.
2173 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2174 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2175 switch (Call->getIntrinsicID()) {
2176 default:
2177 break;
2178 case Intrinsic::start_loop_iterations:
2179 case Intrinsic::test_start_loop_iterations:
2180 case Intrinsic::loop_decrement:
2181 case Intrinsic::loop_decrement_reg:
2182 return true;
2183 }
2184 }
2185 return false;
2186 };
2187
2188 // Scan the instructions to see if there's any that we know will turn into a
2189 // call or if this loop is already a low-overhead loop or will become a tail
2190 // predicated loop.
2191 bool IsTailPredLoop = false;
2192 auto ScanLoop = [&](Loop *L) {
2193 for (auto *BB : L->getBlocks()) {
2194 for (auto &I : *BB) {
2195 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2196 isa<InlineAsm>(I)) {
2197 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2198 return false;
2199 }
// Remember whether the loop looks like it will be tail-predicated; this
// disables the WLS entry test below.
2200 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2201 IsTailPredLoop |=
2202 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2203 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2204 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2205 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2206 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2207 }
2208 }
2209 return true;
2210 };
2211
2212 // Visit inner loops.
2213 for (auto *Inner : *L)
2214 if (!ScanLoop(Inner))
2215 return false;
2216
2217 if (!ScanLoop(L))
2218 return false;
2219
2220 // TODO: Check whether the trip count calculation is expensive. If L is the
2221 // inner loop but we know it has a low trip count, calculating that trip
2222 // count (in the parent loop) may be detrimental.
2223
2224 LLVMContext &C = L->getHeader()->getContext();
2225 HWLoopInfo.CounterInReg = true;
2226 HWLoopInfo.IsNestingLegal = false;
2227 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2228 HWLoopInfo.CountType = Type::getInt32Ty(C);
2229 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2230 return true;
2231}
2232
2233static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2234 // We don't allow icmp's, and because we only look at single block loops,
2235 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2236 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2237 return false;
2238 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2239 // not currently canonical, but soon will be. Code without them uses icmp, and
2240 // so is not tail predicated as per the condition above. In order to get the
2241 // same performance we treat min and max the same as an icmp for tailpred
2242 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2243 // pick more optimial instructions like VQDMULH. They need to be recognized
2244 // directly by the vectorizer).
2245 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2246 if ((II->getIntrinsicID() == Intrinsic::smin ||
2247 II->getIntrinsicID() == Intrinsic::smax ||
2248 II->getIntrinsicID() == Intrinsic::umin ||
2249 II->getIntrinsicID() == Intrinsic::umax) &&
2250 ++ICmpCount > 1)
2251 return false;
2252
2253 if (isa<FCmpInst>(&I))
2254 return false;
2255
2256 // We could allow extending/narrowing FP loads/stores, but codegen is
2257 // too inefficient so reject this for now.
2258 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2259 return false;
2260
2261 // Extends have to be extending-loads
2262 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2263 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2264 return false;
2265
2266 // Truncs have to be narrowing-stores
2267 if (isa<TruncInst>(&I) )
2268 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2269 return false;
2270
2271 return true;
2272}
2273
2274// To set up a tail-predicated loop, we need to know the total number of
2275// elements processed by that loop. Thus, we need to determine the element
2276// size and:
2277// 1) it should be uniform for all operations in the vector loop, so we
2278// e.g. don't want any widening/narrowing operations.
2279// 2) it should be smaller than i64s because we don't have vector operations
2280// that work on i64s.
2281// 3) we don't want elements to be reversed or shuffled, to make sure the
2282// tail-predication masks/predicates the right lanes.
2283//
// canTailPredicateLoop - check that every instruction and memory access in L
// is compatible with MVE tail-predication (uniform element sizes <= 32 bits,
// supported strides, predicable live-outs).
// NOTE(review): the extraction dropped several original lines here (the
// function-name line, the LiveOuts declaration, parts of the
// ReductionsDisabled initializer, a declaration before ICmpCount, the Ptr
// declaration in the load/store branch, and a LLVM_DEBUG line); code lines
// are kept byte-identical.
2285 const DataLayout &DL,
2286 const LoopAccessInfo *LAI) {
2287 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2288
2289 // If there are live-out values, it is probably a reduction. We can predicate
2290 // most reduction operations freely under MVE using a combination of
2291 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2292 // floating point and integer reductions, but don't check for operators
2293 // specifically here. If the value ends up not being a reduction (and so the
2294 // vectorizer cannot tailfold the loop), we should fall back to standard
2295 // vectorization automatically.
2297 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2298 bool ReductionsDisabled =
2301
// Live-outs must be scalar int/float/half (reduction results); anything else
// cannot be tail-predicated.
2302 for (auto *I : LiveOuts) {
2303 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2304 !I->getType()->isHalfTy()) {
2305 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2306 "live-out value\n");
2307 return false;
2308 }
2309 if (ReductionsDisabled) {
2310 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2311 return false;
2312 }
2313 }
2314
2315 // Next, check that all instructions can be tail-predicated.
2316 PredicatedScalarEvolution PSE = LAI->getPSE();
2318 int ICmpCount = 0;
2319
2320 for (BasicBlock *BB : L->blocks()) {
2321 for (Instruction &I : BB->instructionsWithoutDebug()) {
2322 if (isa<PHINode>(&I))
2323 continue;
2324 if (!canTailPredicateInstruction(I, ICmpCount)) {
2325 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2326 return false;
2327 }
2328
// MVE has no 64-bit element operations, so anything wider than 32 bits
// cannot participate in a tail-predicated loop.
2329 Type *T = I.getType();
2330 if (T->getScalarSizeInBits() > 32) {
2331 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2332 return false;
2333 }
2334 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2336 Type *AccessTy = getLoadStoreType(&I);
2337 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2338 if (NextStride == 1) {
2339 // TODO: for now only allow consecutive strides of 1. We could support
2340 // other strides as long as it is uniform, but let's keep it simple
2341 // for now.
2342 continue;
2343 } else if (NextStride == -1 ||
2344 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2345 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2347 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2348 "be tail-predicated\n.");
2349 return false;
2350 // TODO: don't tail predicate if there is a reversed load?
2351 } else if (EnableMaskedGatherScatters) {
2352 // Gather/scatters do allow loading from arbitrary strides, at
2353 // least if they are loop invariant.
2354 // TODO: Loop variant strides should in theory work, too, but
2355 // this requires further testing.
2356 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2357 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2358 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2359 if (PSE.getSE()->isLoopInvariant(Step, L))
2360 continue;
2361 }
2362 }
2363 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2364 "tail-predicate\n.");
2365 return false;
2366 }
2367 }
2368 }
2369
2370 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2371 return true;
2372}
2373
// preferPredicateOverEpilogue - tell the vectorizer whether to tail-fold this
// loop (MVE tail-predication) instead of emitting a scalar epilogue. Requires
// MVE, a single-block inner loop, a viable hardware loop, and instructions
// that canTailPredicateLoop accepts.
// NOTE(review): the extraction dropped the function-name line and the lines
// fetching SE/AC from TFI (original lines 2406-2407); code lines are kept
// byte-identical.
2375 if (!EnableTailPredication) {
2376 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2377 return false;
2378 }
2379
2380 // Creating a predicated vector loop is the first step for generating a
2381 // tail-predicated hardware loop, for which we need the MVE masked
2382 // load/stores instructions:
2383 if (!ST->hasMVEIntegerOps())
2384 return false;
2385
2386 LoopVectorizationLegality *LVL = TFI->LVL;
2387 Loop *L = LVL->getLoop();
2388
2389 // For now, restrict this to single block loops.
2390 if (L->getNumBlocks() > 1) {
2391 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2392 "loop.\n");
2393 return false;
2394 }
2395
2396 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2397
2398 LoopInfo *LI = LVL->getLoopInfo();
2399 HardwareLoopInfo HWLoopInfo(L);
2400 if (!HWLoopInfo.canAnalyze(*LI)) {
2401 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2402 "analyzable.\n");
2403 return false;
2404 }
2405
2408
2409 // This checks if we have the low-overhead branch architecture
2410 // extension, and if we will create a hardware-loop:
2411 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2412 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2413 "profitable.\n");
2414 return false;
2415 }
2416
2417 DominatorTree *DT = LVL->getDominatorTree();
2418 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2419 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2420 "a candidate.\n");
2421 return false;
2422 }
2423
2424 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2425}
2426
// getPreferredTailFoldingStyle - report which tail-folding style the
// vectorizer should use for this target.
// NOTE(review): the extraction dropped the return-type line and BOTH return
// statements (original lines 2430 and 2436); presumably the early-out returns
// TailFoldingStyle::None and the fall-through returns a data-with-lane-mask
// style - confirm against upstream. Code lines are kept byte-identical.
2428 ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2429 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2431
2432 // Intrinsic @llvm.get.active.lane.mask is supported.
2433 // It is used in the MVETailPredication pass, which requires the number of
2434 // elements processed by this vector loop to setup the tail-predicated
2435 // loop.
2437}
// getUnrollingPreferences - tune loop-unrolling heuristics for ARM, mainly
// for M-class cores (runtime unrolling of small scalar loops).
// NOTE(review): the extraction dropped the function-name/parameter lines and
// several statement lines (noted inline); code lines are kept byte-identical.
2441 // Enable Upper bound unrolling universally, providing that we do not see an
2442 // active lane mask, which will be better kept as a loop to become tail
2443 // predicated than to be conditionally unrolled.
2444 UP.UpperBound =
2445 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2446 return isa<IntrinsicInst>(I) &&
2447 cast<IntrinsicInst>(I).getIntrinsicID() ==
2448 Intrinsic::get_active_lane_mask;
2449 });
2450
2451 // Only currently enable these preferences for M-Class cores.
2452 if (!ST->isMClass())
2453 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2454
2455 // Disable loop unrolling for Oz and Os.
2456 UP.OptSizeThreshold = 0;
// NOTE(review): a further UP.* threshold assignment (original line 2457) was
// dropped by the extraction.
2458 if (L->getHeader()->getParent()->hasOptSize())
2459 return;
2460
2461 SmallVector<BasicBlock*, 4> ExitingBlocks;
2462 L->getExitingBlocks(ExitingBlocks);
2463 LLVM_DEBUG(dbgs() << "Loop has:\n"
2464 << "Blocks: " << L->getNumBlocks() << "\n"
2465 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2466
2467 // Only allow another exit other than the latch. This acts as an early exit
2468 // as it mirrors the profitability calculation of the runtime unroller.
2469 if (ExitingBlocks.size() > 2)
2470 return;
2471
2472 // Limit the CFG of the loop body for targets with a branch predictor.
2473 // Allowing 4 blocks permits if-then-else diamonds in the body.
2474 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2475 return;
2476
2477 // Don't unroll vectorized loops, including the remainder loop
2478 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2479 return;
2480
2481 // Scan the loop: don't unroll loops with calls as this could prevent
2482 // inlining.
// NOTE(review): the declaration of the `Cost` accumulator (original line
// 2483) and the accumulation statement in the loop body (lines 2500-2501)
// were dropped by the extraction.
2484 for (auto *BB : L->getBlocks()) {
2485 for (auto &I : *BB) {
2486 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2487 // scalar code.
2488 if (I.getType()->isVectorTy())
2489 return;
2490
2491 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2492 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2493 if (!isLoweredToCall(F))
2494 continue;
2495 }
2496 return;
2497 }
2498
2499 SmallVector<const Value*, 4> Operands(I.operand_values());
2502 }
2503 }
2504
2505 // On v6m cores, there are very few registers available. We can easily end up
2506 // spilling and reloading more registers in an unrolled loop. Look at the
2507 // number of LCSSA phis as a rough measure of how many registers will need to
2508 // be live out of the loop, reducing the default unroll count if more than 1
2509 // value is needed. In the long run, all of this should be being learnt by a
2510 // machine.
2511 unsigned UnrollCount = 4;
2512 if (ST->isThumb1Only()) {
2513 unsigned ExitingValues = 0;
2515 L->getExitBlocks(ExitBlocks);
2516 for (auto *Exit : ExitBlocks) {
2517 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2518 // only the last is expected to be needed for address operands.
2519 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2520 return PH.getNumOperands() != 1 ||
2521 !isa<GetElementPtrInst>(PH.getOperand(0));
2522 });
2523 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2524 }
2525 if (ExitingValues)
2526 UnrollCount /= ExitingValues;
2527 if (UnrollCount <= 1)
2528 return;
2529 }
2530
2531 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2532 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2533
2534 UP.Partial = true;
2535 UP.Runtime = true;
2536 UP.UnrollRemainder = true;
2538 UP.UnrollAndJam = true;
2540
2541 // Force unrolling small loops can be very useful because of the branch
2542 // taken cost of the backedge.
2543 if (Cost < 12)
2544 UP.Force = true;
2545}
2546
2550}
2551
2552bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2553 TTI::ReductionFlags Flags) const {
2554 if (!ST->hasMVEIntegerOps())
2555 return false;
2556
2557 unsigned ScalarBits = Ty->getScalarSizeInBits();
2558 switch (Opcode) {
2559 case Instruction::Add:
2560 return ScalarBits <= 64;
2561 default:
2562 return false;
2563 }
2564}
2565
// preferPredicatedReductionSelect - reduction selects can be freely predicated
// whenever MVE integer ops are available; without MVE, prefer the default.
// NOTE(review): the extraction dropped the function-name line; code lines are
// kept byte-identical.
2567 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2568 if (!ST->hasMVEIntegerOps())
2569 return false;
2570 return true;
2571}
2572
// getScalingFactorCost - relative cost of using a scaled addressing mode.
// Returns 0 for a legal mode (1 for negative offsets on FPAO cores, which
// execute slower) and -1 when the mode is not legal at all.
// NOTE(review): the extraction dropped the function-name line and the
// declaration of the AddrMode `AM` (original line 2577); code lines are kept
// byte-identical.
2574 int64_t BaseOffset,
2575 bool HasBaseReg, int64_t Scale,
2576 unsigned AddrSpace) const {
2578 AM.BaseGV = BaseGV;
2579 AM.BaseOffs = BaseOffset;
2580 AM.HasBaseReg = HasBaseReg;
2581 AM.Scale = Scale;
2582 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2583 if (ST->hasFPAO())
2584 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2585 return 0;
2586 }
2587 return -1;
2588}
2589
2590bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2591 if (Thumb) {
2592 // B.W is available in any Thumb2-supporting target, and also in every
2593 // version of Armv8-M, even Baseline which does not include the rest of
2594 // Thumb2.
2595 return ST->isThumb2() || ST->hasV8MBaselineOps();
2596 } else {
2597 // B is available in all versions of the Arm ISA, so the only question is
2598 // whether that ISA is available at all.
2599 return ST->hasARMOps();
2600 }
2601}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
cl::opt< bool > EnableMaskedGatherScatters
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
cl::opt< TailPredication::Mode > EnableTailPredication
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
This file a TargetTransformInfo::Concept conforming object specific to the ARM target machine.
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
if(VerifyEach)
const char LLVMTargetMachineRef TM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:76
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:217
bool hasARMOps() const
Definition: ARMSubtarget.h:335
bool isThumb1Only() const
Definition: ARMSubtarget.h:434
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:344
bool isThumb2() const
Definition: ARMSubtarget.h:435
bool hasVFP2Base() const
Definition: ARMSubtarget.h:341
bool isMClass() const
Definition: ARMSubtarget.h:436
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:559
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
bool maybeLoweredToCall(Instruction &I)
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
InstructionCost getMemcpyCost(const Instruction *I)
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLoweredToCall(const Function *F)
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool hasArmWideBranch(bool Thumb) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalMaskedGather(Type *Ty, Align Alignment)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool isProfitableLSRChainElement(Instruction *I)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool useSoftFloat() const override
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:582
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:891
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:969
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:762
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:654
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:855
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:339
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore)
Construct a binary instruction, given the opcode and the two operands.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1660
unsigned arg_size() const
Definition: InstrTypes.h:1658
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1335
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:966
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:996
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:993
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
This is an important base class in LLVM.
Definition: Constant.h:41
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1214
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
The core instruction combiner logic.
Definition: InstCombiner.h:47
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:340
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:339
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:385
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:409
BuilderTy & Builder
Definition: InstCombiner.h:60
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:337
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:149
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:359
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1636
bool isShift() const
Definition: Instruction.h:259
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
Root of the metadata hierarchy.
Definition: Metadata.h:62
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
size_t size() const
Definition: SmallVector.h:91
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
Provides information about what library functions are available for the current target.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:216
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:160
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:821
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:163
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:561
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1085
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AddressSpace
Definition: NVPTXBaseInfo.h:21
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:241
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:123
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
InstructionCost Cost
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
Attributes of a target dependent hardware loop.
bool canAnalyze(LoopInfo &LI)
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Type Conversion Cost Table.
Definition: CostTable.h:55