1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
63extern cl::opt<TailPredication::Mode> EnableTailPredication;
64
65extern cl::opt<bool> EnableMaskedGatherScatters;
66
67extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
68
70 "arm-force-unroll-threshold", cl::init(12), cl::Hidden,
72 "Threshold for forced unrolling of small loops in Arm architecture"));
73
74/// Convert a vector load intrinsic into a simple llvm load instruction.
75/// This is beneficial when the underlying object being addressed comes
76/// from a constant, since we get constant-folding for free.
77static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
78 InstCombiner::BuilderTy &Builder) {
79 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
80
81 if (!IntrAlign)
82 return nullptr;
83
84 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
85 ? MemAlign
86 : IntrAlign->getLimitedValue();
87
88 if (!isPowerOf2_32(Alignment))
89 return nullptr;
90
91 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
92 Align(Alignment));
93}
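// Illustrative sketch (approximate IR, not taken from a specific test): a call
// such as
//   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 4)
// is rewritten by the helper above into an equivalent plain aligned load,
//   %v = load <4 x i32>, ptr %p, align 4
// which later passes can constant-fold when %p points at a constant global.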
94
95bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
96 const Function *Callee) const {
97 const TargetMachine &TM = getTLI()->getTargetMachine();
98 const FeatureBitset &CallerBits =
99 TM.getSubtargetImpl(*Caller)->getFeatureBits();
100 const FeatureBitset &CalleeBits =
101 TM.getSubtargetImpl(*Callee)->getFeatureBits();
102
103 // To inline a callee, all features not in the allowed list must match exactly.
104 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
105 (CalleeBits & ~InlineFeaturesAllowed);
106 // For features in the allowed list, the callee's features must be a subset of
107 // the callers'.
108 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
109 (CalleeBits & InlineFeaturesAllowed);
110
111 LLVM_DEBUG({
112 if (!MatchExact || !MatchSubset) {
113 dbgs() << "=== Inline compatibility debug ===\n";
114 dbgs() << "Caller: " << Caller->getName() << "\n";
115 dbgs() << "Callee: " << Callee->getName() << "\n";
116
117 // Bit diffs
118 FeatureBitset MissingInCaller = CalleeBits & ~CallerBits; // callee-only
119 FeatureBitset ExtraInCaller = CallerBits & ~CalleeBits; // caller-only
120
121 // Counts
122 dbgs() << "Only-in-caller bit count: " << ExtraInCaller.count() << "\n";
123 dbgs() << "Only-in-callee bit count: " << MissingInCaller.count() << "\n";
124
125 dbgs() << "Only-in-caller feature indices [";
126 {
127 bool First = true;
128 for (size_t I = 0, E = ExtraInCaller.size(); I < E; ++I) {
129 if (ExtraInCaller.test(I)) {
130 if (!First)
131 dbgs() << ", ";
132 dbgs() << I;
133 First = false;
134 }
135 }
136 }
137 dbgs() << "]\n";
138
139 dbgs() << "Only-in-callee feature indices [";
140 {
141 bool First = true;
142 for (size_t I = 0, E = MissingInCaller.size(); I < E; ++I) {
143 if (MissingInCaller.test(I)) {
144 if (!First)
145 dbgs() << ", ";
146 dbgs() << I;
147 First = false;
148 }
149 }
150 }
151 dbgs() << "]\n";
152
153 // Indices map to features as found in
154 // llvm-project/(your_build)/lib/Target/ARM/ARMGenSubtargetInfo.inc
155 dbgs() << "MatchExact=" << (MatchExact ? "true" : "false")
156 << " MatchSubset=" << (MatchSubset ? "true" : "false") << "\n";
157 }
158 });
159 return MatchExact && MatchSubset;
160}
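// Illustrative example of the rule above (the feature names are hypothetical
// for this sketch): if the caller was built with {+thumb-mode,+dsp} and the
// callee with {+thumb-mode}, and +dsp is in InlineFeaturesAllowed, the
// callee's allowed features are a subset of the caller's and inlining is
// permitted; a callee requiring a feature the caller lacks, say +mve, is
// rejected.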
161
162TTI::AddressingModeKind
163ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
164 ScalarEvolution *SE) const {
165 if (ST->hasMVEIntegerOps())
166 return TTI::AMK_PostIndexed;
167
168 if (L->getHeader()->getParent()->hasOptSize())
169 return TTI::AMK_None;
170
171 if (ST->isMClass() && ST->isThumb2() &&
172 L->getNumBlocks() == 1)
173 return TTI::AMK_PreIndexed;
174
175 return TTI::AMK_None;
176}
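// For reference, a pre-indexed access folds the address update into the memory
// operation, e.g. "ldr r0, [r1, #4]!", which is the form AMK_PreIndexed models
// for the single-block Thumb2 M-class loops handled above.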
177
178std::optional<Instruction *>
179ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
180 using namespace PatternMatch;
181 Intrinsic::ID IID = II.getIntrinsicID();
182 switch (IID) {
183 default:
184 break;
185 case Intrinsic::arm_neon_vld1: {
186 Align MemAlign =
187 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
188 &IC.getAssumptionCache(), &IC.getDominatorTree());
189 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
190 return IC.replaceInstUsesWith(II, V);
191 }
192 break;
193 }
194
195 case Intrinsic::arm_neon_vld2:
196 case Intrinsic::arm_neon_vld3:
197 case Intrinsic::arm_neon_vld4:
198 case Intrinsic::arm_neon_vld2lane:
199 case Intrinsic::arm_neon_vld3lane:
200 case Intrinsic::arm_neon_vld4lane:
201 case Intrinsic::arm_neon_vst1:
202 case Intrinsic::arm_neon_vst2:
203 case Intrinsic::arm_neon_vst3:
204 case Intrinsic::arm_neon_vst4:
205 case Intrinsic::arm_neon_vst2lane:
206 case Intrinsic::arm_neon_vst3lane:
207 case Intrinsic::arm_neon_vst4lane: {
208 Align MemAlign =
209 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
210 &IC.getAssumptionCache(), &IC.getDominatorTree());
211 unsigned AlignArg = II.arg_size() - 1;
212 Value *AlignArgOp = II.getArgOperand(AlignArg);
213 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
214 if (Align && *Align < MemAlign) {
215 return IC.replaceOperand(
216 II, AlignArg,
217 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
218 false));
219 }
220 break;
221 }
222
223 case Intrinsic::arm_neon_vld1x2:
224 case Intrinsic::arm_neon_vld1x3:
225 case Intrinsic::arm_neon_vld1x4:
226 case Intrinsic::arm_neon_vst1x2:
227 case Intrinsic::arm_neon_vst1x3:
228 case Intrinsic::arm_neon_vst1x4: {
229 Align NewAlign =
230 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
231 &IC.getAssumptionCache(), &IC.getDominatorTree());
232 Align OldAlign = II.getParamAlign(0).valueOrOne();
233 if (NewAlign > OldAlign)
234 II.addParamAttr(0,
235 Attribute::getWithAlignment(II.getContext(), NewAlign));
236 break;
237 }
238
239 case Intrinsic::arm_mve_pred_i2v: {
240 Value *Arg = II.getArgOperand(0);
241 Value *ArgArg;
242 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
243 PatternMatch::m_Value(ArgArg))) &&
244 II.getType() == ArgArg->getType()) {
245 return IC.replaceInstUsesWith(II, ArgArg);
246 }
247 Constant *XorMask;
248 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
249 PatternMatch::m_Value(ArgArg)),
250 PatternMatch::m_Constant(XorMask))) &&
251 II.getType() == ArgArg->getType()) {
252 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
253 if (CI->getValue().trunc(16).isAllOnes()) {
254 auto TrueVector = IC.Builder.CreateVectorSplat(
255 cast<FixedVectorType>(II.getType())->getNumElements(),
256 IC.Builder.getTrue());
257 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
258 }
259 }
260 }
261 KnownBits ScalarKnown(32);
262 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
263 ScalarKnown)) {
264 return &II;
265 }
266 break;
267 }
268 case Intrinsic::arm_mve_pred_v2i: {
269 Value *Arg = II.getArgOperand(0);
270 Value *ArgArg;
271 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
272 PatternMatch::m_Value(ArgArg)))) {
273 return IC.replaceInstUsesWith(II, ArgArg);
274 }
275
276 if (II.getMetadata(LLVMContext::MD_range))
277 break;
278
279 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
280
281 if (auto CurrentRange = II.getRange()) {
282 Range = Range.intersectWith(*CurrentRange);
283 if (Range == CurrentRange)
284 break;
285 }
286
287 II.addRangeRetAttr(Range);
288 II.addRetAttr(Attribute::NoUndef);
289 return &II;
290 }
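// Rough illustration of the i2v/v2i folds above: converting a predicate to an
// integer and back (in either order) is the identity, so a chain such as
//   %i = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %p)
//   %q = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %i)
// simplifies to just %p, and a v2i result is known to fit in the low 16 bits,
// which the range attribute added above records.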
291 case Intrinsic::arm_mve_vadc:
292 case Intrinsic::arm_mve_vadc_predicated: {
293 unsigned CarryOp =
294 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
295 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
296 "Bad type for intrinsic!");
297
298 KnownBits CarryKnown(32);
299 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
300 CarryKnown)) {
301 return &II;
302 }
303 break;
304 }
305 case Intrinsic::arm_mve_vmldava: {
306 Instruction *I = cast<Instruction>(&II);
307 if (I->hasOneUse()) {
308 auto *User = cast<Instruction>(*I->user_begin());
309 Value *OpZ;
310 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
311 match(I->getOperand(3), m_Zero())) {
312 Value *OpX = I->getOperand(4);
313 Value *OpY = I->getOperand(5);
314 Type *OpTy = OpX->getType();
315
316 IC.Builder.SetInsertPoint(User);
317 Value *V =
318 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
319 {I->getOperand(0), I->getOperand(1),
320 I->getOperand(2), OpZ, OpX, OpY});
321
322 IC.replaceInstUsesWith(*User, V);
323 return IC.eraseInstFromFunction(*User);
324 }
325 }
326 return std::nullopt;
327 }
328 }
329 return std::nullopt;
330}
331
332std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
333 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
334 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
335 std::function<void(Instruction *, unsigned, APInt, APInt &)>
336 SimplifyAndSetOp) const {
337
338 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
339 // opcode specifying a Top/Bottom instruction, which can change between
340 // instructions.
341 auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
342 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
343 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
344
345 // Only the odd/even lanes of operand 0 will be demanded, depending
346 // on whether this is a top/bottom instruction.
347 APInt DemandedElts =
348 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
349 : APInt::getHighBitsSet(2, 1));
350 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
351 // The other lanes will be defined from the inserted elements.
352 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
353 : APInt::getHighBitsSet(2, 1));
354 return std::nullopt;
355 };
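// Worked example of the lambda above (illustrative): for a v8i16 result with
// IsTop == 1, splatting getLowBitsSet(2, 1) across 8 lanes gives DemandedElts
// == 0b01010101, i.e. only the even lanes of operand 0 are demanded; the odd
// lanes of the result are written by the narrowing instruction itself.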
356
357 switch (II.getIntrinsicID()) {
358 default:
359 break;
360 case Intrinsic::arm_mve_vcvt_narrow:
361 SimplifyNarrowInstrTopBottom(2);
362 break;
363 case Intrinsic::arm_mve_vqmovn:
364 SimplifyNarrowInstrTopBottom(4);
365 break;
366 case Intrinsic::arm_mve_vshrn:
367 SimplifyNarrowInstrTopBottom(7);
368 break;
369 }
370
371 return std::nullopt;
372}
373
374InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
375 TTI::TargetCostKind CostKind) const {
376 assert(Ty->isIntegerTy());
377
378 unsigned Bits = Ty->getPrimitiveSizeInBits();
379 if (Bits == 0 || Imm.getActiveBits() >= 64)
380 return 4;
381
382 int64_t SImmVal = Imm.getSExtValue();
383 uint64_t ZImmVal = Imm.getZExtValue();
384 if (!ST->isThumb()) {
385 if ((SImmVal >= 0 && SImmVal < 65536) ||
386 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
387 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
388 return 1;
389 return ST->hasV6T2Ops() ? 2 : 3;
390 }
391 if (ST->isThumb2()) {
392 if ((SImmVal >= 0 && SImmVal < 65536) ||
393 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
394 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
395 return 1;
396 return ST->hasV6T2Ops() ? 2 : 3;
397 }
398 // Thumb1: any i8 imm costs 1.
399 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
400 return 1;
401 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
402 return 2;
403 // Load from constantpool.
404 return 3;
405}
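// Rough examples of the buckets above (not exhaustive): #255 encodes directly
// as a modified immediate and costs 1; a value like 0x12345678 needs a
// MOVW/MOVT pair on v6T2 and later, so it is costed as 2; without v6T2 it
// would be loaded from a constant pool and costed as 3.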
406
407// Constants smaller than 256 fit in the immediate field of
408// Thumb1 instructions so we return a zero cost and 1 otherwise.
409InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
410 const APInt &Imm,
411 Type *Ty) const {
412 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
413 return 0;
414
415 return 1;
416}
417
418// Checks whether Inst is part of a min(max()) or max(min()) pattern
419// that will match to an SSAT instruction. Returns the instruction being
420// saturated, or null if no saturation pattern was found.
421static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
422 Value *LHS, *RHS;
423 ConstantInt *C;
424 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
425
426 if (InstSPF == SPF_SMAX &&
427 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
428 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
429
430 auto isSSatMin = [&](Value *MinInst) {
431 if (isa<SelectInst>(MinInst)) {
432 Value *MinLHS, *MinRHS;
433 ConstantInt *MinC;
434 SelectPatternFlavor MinSPF =
435 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
436 if (MinSPF == SPF_SMIN &&
437 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
438 MinC->getValue() == ((-Imm) - 1))
439 return true;
440 }
441 return false;
442 };
443
444 if (isSSatMin(Inst->getOperand(1)))
445 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
446 if (Inst->hasNUses(2) &&
447 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
448 return Inst->getOperand(1);
449 }
450 return nullptr;
451}
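// Illustrative shape of the pattern matched above, with Imm == -128:
//   %lo = smin(%x, 127)    ; MinC == (-Imm) - 1
//   %r  = smax(%lo, -128)  ; Imm, a negated power of two
// corresponds to "ssat r0, #8, r1", so constants feeding this pattern should
// not be hoisted as if they were expensive materializations.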
452
453// Look for a FP Saturation pattern, where the instruction can be simplified to
454 // an fptosi.sat: max(min(fptosi)). The constant in this case is always free.
455static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
456 if (Imm.getBitWidth() != 64 ||
457 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
458 return false;
459 Value *FP = isSSATMinMaxPattern(Inst, Imm);
460 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
461 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
462 if (!FP)
463 return false;
464 return isa<FPToSIInst>(FP);
465}
466
467InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
468 const APInt &Imm, Type *Ty,
469 TTI::TargetCostKind CostKind,
470 Instruction *Inst) const {
471 // Division by a constant can be turned into multiplication, but only if we
472 // know it's constant. So it's not so much that the immediate is cheap (it's
473 // not), but that the alternative is worse.
474 // FIXME: this is probably unneeded with GlobalISel.
475 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
476 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
477 Idx == 1)
478 return 0;
479
480 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
481 // splitting any large offsets.
482 if (Opcode == Instruction::GetElementPtr && Idx != 0)
483 return 0;
484
485 if (Opcode == Instruction::And) {
486 // UXTB/UXTH
487 if (Imm == 255 || Imm == 65535)
488 return 0;
489 // Conversion to BIC is free, and means we can use ~Imm instead.
490 return std::min(getIntImmCost(Imm, Ty, CostKind),
491 getIntImmCost(~Imm, Ty, CostKind));
492 }
493
494 if (Opcode == Instruction::Add)
495 // Conversion to SUB is free, and means we can use -Imm instead.
496 return std::min(getIntImmCost(Imm, Ty, CostKind),
497 getIntImmCost(-Imm, Ty, CostKind));
498
499 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
500 Ty->getIntegerBitWidth() == 32) {
501 int64_t NegImm = -Imm.getSExtValue();
502 if (ST->isThumb2() && NegImm < 1<<12)
503 // icmp X, #-C -> cmn X, #C
504 return 0;
505 if (ST->isThumb() && NegImm < 1<<8)
506 // icmp X, #-C -> adds X, #C
507 return 0;
508 }
509
510 // xor a, -1 can always be folded to MVN
511 if (Opcode == Instruction::Xor && Imm.isAllOnes())
512 return 0;
513
514 // Ensures negative constant of min(max()) or max(min()) patterns that
515 // match to SSAT instructions don't get hoisted
516 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
517 Ty->getIntegerBitWidth() <= 32) {
518 if (isSSATMinMaxPattern(Inst, Imm) ||
519 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
520 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
521 return 0;
522 }
523
524 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
525 return 0;
526
527 // We can convert <= -1 to < 0, which is generally quite cheap.
528 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
529 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
530 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
531 return std::min(getIntImmCost(Imm, Ty, CostKind),
532 getIntImmCost(Imm + 1, Ty, CostKind));
533 }
534
535 return getIntImmCost(Imm, Ty, CostKind);
536}
537
538InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
539 TTI::TargetCostKind CostKind,
540 const Instruction *I) const {
541 if (CostKind == TTI::TCK_RecipThroughput &&
542 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
543 // FIXME: The vectorizer is highly sensitive to the cost of these
544 // instructions, which suggests that it may be using the costs incorrectly.
545 // But, for now, just make them free to avoid performance regressions for
546 // vector targets.
547 return 0;
548 }
549 return BaseT::getCFInstrCost(Opcode, CostKind, I);
550}
551
552InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
553 Type *Src,
554 TTI::CastContextHint CCH,
555 TTI::TargetCostKind CostKind,
556 const Instruction *I) const {
557 int ISD = TLI->InstructionOpcodeToISD(Opcode);
558 assert(ISD && "Invalid opcode");
559
560 // TODO: Allow non-throughput costs that aren't binary.
561 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
562 if (CostKind != TTI::TCK_RecipThroughput)
563 return Cost == 0 ? 0 : 1;
564 return Cost;
565 };
566 auto IsLegalFPType = [this](EVT VT) {
567 EVT EltVT = VT.getScalarType();
568 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
569 (EltVT == MVT::f64 && ST->hasFP64()) ||
570 (EltVT == MVT::f16 && ST->hasFullFP16());
571 };
572
573 EVT SrcTy = TLI->getValueType(DL, Src);
574 EVT DstTy = TLI->getValueType(DL, Dst);
575
576 if (!SrcTy.isSimple() || !DstTy.isSimple())
577 return AdjustCost(
578 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
579
580 // Extending masked loads/truncating masked stores are expensive because we
581 // currently don't split them. This means that we'll likely end up
582 // loading/storing each element individually (hence the high cost).
583 if ((ST->hasMVEIntegerOps() &&
584 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
585 Opcode == Instruction::SExt)) ||
586 (ST->hasMVEFloatOps() &&
587 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
588 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
589 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
590 return 2 * DstTy.getVectorNumElements() *
591 ST->getMVEVectorCostFactor(CostKind);
592
593 // The extend of other kinds of load is free
594 if (CCH == TTI::CastContextHint::Normal ||
595 CCH == TTI::CastContextHint::Masked) {
596 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
597 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
598 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
599 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
600 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
601 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
602 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
603 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
604 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
605 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
606 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
607 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
608 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
609 };
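// The zero-cost entries above reflect that a scalar extending load is a single
// instruction anyway, e.g. an i8->i32 sign-extending load is just one LDRSB,
// so the extend itself adds nothing.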
610 if (const auto *Entry = ConvertCostTableLookup(
611 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
612 return AdjustCost(Entry->Cost);
613
614 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
615 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
616 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
617 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
618 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
619 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
620 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
621 // The following extend from a legal type to an illegal type, so need to
622 // split the load. This introduces an extra load operation, but the
623 // extend is still "free".
624 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
625 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
626 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
627 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
628 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
629 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
630 };
631 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
632 if (const auto *Entry =
633 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
634 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
635 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
636 }
637
638 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
639 // FPExtends are similar but also require the VCVT instructions.
640 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
641 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
642 };
643 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
644 if (const auto *Entry =
645 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
646 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
647 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
648 }
649
650 // The truncate of a store is free. This is the mirror of extends above.
651 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
652 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
653 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
654 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
655 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
656 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
657 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
658 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
659 };
660 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
661 if (const auto *Entry =
662 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
663 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
664 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
665 }
666
667 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
668 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
669 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
670 };
671 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
672 if (const auto *Entry =
673 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
674 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
675 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
676 }
677 }
678
679 // NEON vector operations that can extend their inputs.
680 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
681 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
682 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
683 // vaddl
684 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
685 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
686 // vsubl
687 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
688 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
689 // vmull
690 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
691 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
692 // vshll
693 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
694 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
695 };
696
697 auto *User = cast<Instruction>(*I->user_begin());
698 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
699 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
700 DstTy.getSimpleVT(),
701 SrcTy.getSimpleVT())) {
702 return AdjustCost(Entry->Cost);
703 }
704 }
705
706 // Single to/from double precision conversions.
707 if (Src->isVectorTy() && ST->hasNEON() &&
708 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
709 DstTy.getScalarType() == MVT::f32) ||
710 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
711 DstTy.getScalarType() == MVT::f64))) {
712 static const CostTblEntry NEONFltDblTbl[] = {
713 // Vector fptrunc/fpext conversions.
714 {ISD::FP_ROUND, MVT::v2f64, 2},
715 {ISD::FP_EXTEND, MVT::v2f32, 2},
716 {ISD::FP_EXTEND, MVT::v4f32, 4}};
717
718 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
719 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
720 return AdjustCost(LT.first * Entry->Cost);
721 }
722
723 // Some arithmetic, load and store operations have specific instructions
724 // to cast up/down their types automatically at no extra cost.
725 // TODO: Get these tables to know at least what the related operations are.
726 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
727 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
728 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
729 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
730 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
731 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
732 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
733
734 // The number of vmovl instructions for the extension.
735 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
736 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
737 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
738 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
739 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
740 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
741 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
742 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
743 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
744 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
745 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
746 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
747 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
748 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
749 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
750 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
751 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
752 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
753
754 // Operations that we legalize using splitting.
755 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
756 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
757
758 // Vector float <-> i32 conversions.
759 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
760 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
761
762 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
763 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
764 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
765 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
766 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
767 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
768 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
769 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
770 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
771 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
772 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
773 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
774 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
775 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
776 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
777 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
778 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
779 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
780 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
781 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
782
783 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
784 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
785 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
786 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
787 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
788 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
789
790 // Vector double <-> i32 conversions.
791 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
792 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
793
794 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
795 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
796 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
797 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
798 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
799 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
800
801 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
802 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
803 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
804 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
805 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
806 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
807 };
808
809 if (SrcTy.isVector() && ST->hasNEON()) {
810 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
811 DstTy.getSimpleVT(),
812 SrcTy.getSimpleVT()))
813 return AdjustCost(Entry->Cost);
814 }
815
816 // Scalar float to integer conversions.
817 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
818 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
819 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
820 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
821 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
822 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
823 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
824 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
825 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
826 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
827 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
828 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
829 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
830 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
831 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
832 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
833 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
834 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
835 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
836 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
837 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
838 };
839 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
840 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
841 DstTy.getSimpleVT(),
842 SrcTy.getSimpleVT()))
843 return AdjustCost(Entry->Cost);
844 }
845
846 // Scalar integer to float conversions.
847 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
848 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
849 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
850 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
851 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
852 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
853 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
854 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
855 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
856 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
857 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
858 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
859 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
860 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
861 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
862 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
863 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
864 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
865 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
866 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
867 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
868 };
869
870 if (SrcTy.isInteger() && ST->hasNEON()) {
871 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
872 ISD, DstTy.getSimpleVT(),
873 SrcTy.getSimpleVT()))
874 return AdjustCost(Entry->Cost);
875 }
876
877 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
878 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
879 // are linearised so take more.
880 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
881 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
882 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
883 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
884 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
885 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
886 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
887 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
888 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
889 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
890 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
891 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
892 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
893 };
894
895 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
896 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
897 ISD, DstTy.getSimpleVT(),
898 SrcTy.getSimpleVT()))
899 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
900 }
901
902 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
903 // As a general rule, fp converts that were not matched above are scalarized
904 // and cost 1 vcvt for each lane, so long as the instruction is available.
905 // If not it will become a series of function calls.
906 const InstructionCost CallCost =
907 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
908 int Lanes = 1;
909 if (SrcTy.isFixedLengthVector())
910 Lanes = SrcTy.getVectorNumElements();
911
912 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
913 return Lanes;
914 else
915 return Lanes * CallCost;
916 }
917
918 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
919 SrcTy.isFixedLengthVector()) {
920 // Treat a truncate with a larger than legal source (128 bits for MVE) as
921 // expensive, 2 instructions per lane.
922 if ((SrcTy.getScalarType() == MVT::i8 ||
923 SrcTy.getScalarType() == MVT::i16 ||
924 SrcTy.getScalarType() == MVT::i32) &&
925 SrcTy.getSizeInBits() > 128 &&
926 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
927 return SrcTy.getVectorNumElements() * 2;
928 }
929
930 // Scalar integer conversion costs.
931 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
932 // i16 -> i64 requires two dependent operations.
933 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
934
935 // Truncates on i64 are assumed to be free.
936 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
937 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
938 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
939 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
940 };
941
942 if (SrcTy.isInteger()) {
943 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
944 DstTy.getSimpleVT(),
945 SrcTy.getSimpleVT()))
946 return AdjustCost(Entry->Cost);
947 }
948
949 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
950 ? ST->getMVEVectorCostFactor(CostKind)
951 : 1;
952 return AdjustCost(
953 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
954}
955
956InstructionCost ARMTTIImpl::getVectorInstrCost(
957 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
958 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
959 // Penalize inserting into a D-subregister. We end up with a three times
960 // lower estimated throughput on swift.
961 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
962 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
963 return 3;
964
965 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
966 Opcode == Instruction::ExtractElement)) {
967 // Cross-class copies are expensive on many microarchitectures,
968 // so assume they are expensive by default.
969 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
970 return 3;
971
972 // Even if it's not a cross class copy, this likely leads to mixing
973 // of NEON and VFP code and should be therefore penalized.
974 if (ValTy->isVectorTy() &&
975 ValTy->getScalarSizeInBits() <= 32)
976 return std::max<InstructionCost>(
977 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
978 VIC),
979 2U);
980 }
981
982 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
983 Opcode == Instruction::ExtractElement)) {
984 // Integer cross-lane moves are more expensive than float, which can
985 // sometimes just be vmovs. Integer moves involve passing through the GPR
986 // registers, causing more of a delay.
987 std::pair<InstructionCost, MVT> LT =
988 getTypeLegalizationCost(ValTy->getScalarType());
989 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
990 }
991
992 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
993 VIC);
994}
995
996InstructionCost ARMTTIImpl::getCmpSelInstrCost(
997 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
998 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
999 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
1000 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1001
1002 // Thumb scalar code size cost for select.
1003 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
1004 ST->isThumb() && !ValTy->isVectorTy()) {
1005 // Assume expensive structs.
1006 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
1007 return TTI::TCC_Expensive;
1008
1009 // Select costs can vary because they:
1010 // - may require one or more conditional mov (including an IT),
1011 // - can't operate directly on immediates,
1012 // - require live flags, which we can't copy around easily.
1013 InstructionCost Cost = TTI::TCC_Basic;
1014
1015 // Possible IT instruction for Thumb2, or more for Thumb1.
1016 ++Cost;
1017
1018 // i1 values may need rematerialising by using mov immediates and/or
1019 // flag setting instructions.
1020 if (ValTy->isIntegerTy(1))
1021 ++Cost;
1022
1023 return Cost;
1024 }
1025
1026 // If this is a vector min/max/abs, use the cost of that intrinsic directly
1027 // instead. Hopefully when min/max intrinsics are more prevalent this code
1028 // will not be needed.
1029 const Instruction *Sel = I;
1030 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
1031 Sel->hasOneUse())
1032 Sel = cast<Instruction>(Sel->user_back());
1033 if (Sel && ValTy->isVectorTy() &&
1034 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
1035 const Value *LHS, *RHS;
1036 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
1037 unsigned IID = 0;
1038 switch (SPF) {
1039 case SPF_ABS:
1040 IID = Intrinsic::abs;
1041 break;
1042 case SPF_SMIN:
1043 IID = Intrinsic::smin;
1044 break;
1045 case SPF_SMAX:
1046 IID = Intrinsic::smax;
1047 break;
1048 case SPF_UMIN:
1049 IID = Intrinsic::umin;
1050 break;
1051 case SPF_UMAX:
1052 IID = Intrinsic::umax;
1053 break;
1054 case SPF_FMINNUM:
1055 IID = Intrinsic::minnum;
1056 break;
1057 case SPF_FMAXNUM:
1058 IID = Intrinsic::maxnum;
1059 break;
1060 default:
1061 break;
1062 }
1063 if (IID) {
1064 // The ICmp is free, the select gets the cost of the min/max/etc
1065 if (Sel != I)
1066 return 0;
1067 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1068 return getIntrinsicInstrCost(CostAttrs, CostKind);
1069 }
1070 }
1071
1072 // On NEON a vector select gets lowered to vbsl.
1073 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1074 // Lowering of some vector selects is currently far from perfect.
1075 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1076 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1077 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1078 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1079 };
1080
1081 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1082 EVT SelValTy = TLI->getValueType(DL, ValTy);
1083 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1084 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1085 SelCondTy.getSimpleVT(),
1086 SelValTy.getSimpleVT()))
1087 return Entry->Cost;
1088 }
1089
1090 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1091 return LT.first;
1092 }
1093
1094 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1095 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1096 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1097 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1098 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1099 if (!VecCondTy)
1100 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1101
1102 // If we don't have mve.fp any fp operations will need to be scalarized.
1103 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1104 // One scalarization insert, one scalarization extract and the cost of the
1105 // fcmps.
1106 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1107 /*Extract*/ true, CostKind) +
1108 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1109 /*Extract*/ false, CostKind) +
1110 VecValTy->getNumElements() *
1111 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1112 VecCondTy->getScalarType(), VecPred,
1113 CostKind, Op1Info, Op2Info, I);
1114 }
1115
1116 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1117 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1118 // There are two types - the input that specifies the type of the compare
1119 // and the output vXi1 type. Because we don't know how the output will be
1120 // split, we may need an expensive shuffle to get two in sync. This has the
1121 // effect of making larger than legal compares (v8i32 for example)
1122 // expensive.
1123 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1124 if (LT.first > 1)
1125 return LT.first * BaseCost +
1126 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1127 /*Extract*/ false, CostKind);
1128 return BaseCost;
1129 }
1130 }
1131
1132 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1133 // for "multiple beats" potentially needed by MVE instructions.
1134 int BaseCost = 1;
1135 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1136 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1137
1138 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1139 CostKind, Op1Info, Op2Info, I);
1140}
1141
1142InstructionCost ARMTTIImpl::getAddressComputationCost(Type *PtrTy,
1143 ScalarEvolution *SE,
1144 const SCEV *Ptr,
1145 TTI::TargetCostKind CostKind) const {
1146 // Address computations in vectorized code with non-consecutive addresses will
1147 // likely result in more instructions compared to scalar code where the
1148 // computation can more often be merged into the index mode. The resulting
1149 // extra micro-ops can significantly decrease throughput.
1150 unsigned NumVectorInstToHideOverhead = 10;
1151 int MaxMergeDistance = 64;
1152
1153 if (ST->hasNEON()) {
1154 if (PtrTy->isVectorTy() && SE &&
1155 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1156 return NumVectorInstToHideOverhead;
1157
1158 // In many cases the address computation is not merged into the instruction
1159 // addressing mode.
1160 return 1;
1161 }
1162 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1163}
1164
1165bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
1166 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
1167 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1168 // optimized, else LSR may block tail-predication.
1169 switch (II->getIntrinsicID()) {
1170 case Intrinsic::arm_mve_vctp8:
1171 case Intrinsic::arm_mve_vctp16:
1172 case Intrinsic::arm_mve_vctp32:
1173 case Intrinsic::arm_mve_vctp64:
1174 return true;
1175 default:
1176 break;
1177 }
1178 }
1179 return false;
1180}
1181
1182bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
1183 unsigned /*AddressSpace*/,
1184 TTI::MaskKind /*MaskKind*/) const {
1185 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1186 return false;
1187
1188 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1189 // Don't support v2i1 yet.
1190 if (VecTy->getNumElements() == 2)
1191 return false;
1192
1193 // We don't support extending fp types.
1194 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1195 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1196 return false;
1197 }
1198
1199 unsigned EltWidth = DataTy->getScalarSizeInBits();
1200 return (EltWidth == 32 && Alignment >= 4) ||
1201 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1202}
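// For example (illustrative), a masked load of <4 x i32> with align 4 or of
// <8 x i16> with align 2 is reported legal here, while a <4 x i32> load only
// known to be align 1 is not and would be scalarized instead.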
1203
1204bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1205 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1206 return false;
1207
1208 unsigned EltWidth = Ty->getScalarSizeInBits();
1209 return ((EltWidth == 32 && Alignment >= 4) ||
1210 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1211}
1212
1213/// Given a memcpy/memset/memmove instruction, return the number of memory
1214/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1215/// call is used.
1216int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1217 MemOp MOp;
1218 unsigned DstAddrSpace = ~0u;
1219 unsigned SrcAddrSpace = ~0u;
1220 const Function *F = I->getParent()->getParent();
1221
1222 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1223 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1224 // If 'size' is not a constant, a library call will be generated.
1225 if (!C)
1226 return -1;
1227
1228 const unsigned Size = C->getValue().getZExtValue();
1229 const Align DstAlign = MC->getDestAlign().valueOrOne();
1230 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1231
1232 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1233 /*IsVolatile*/ false);
1234 DstAddrSpace = MC->getDestAddressSpace();
1235 SrcAddrSpace = MC->getSourceAddressSpace();
1236 }
1237 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1238 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1239 // If 'size' is not a constant, a library call will be generated.
1240 if (!C)
1241 return -1;
1242
1243 const unsigned Size = C->getValue().getZExtValue();
1244 const Align DstAlign = MS->getDestAlign().valueOrOne();
1245
1246 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1247 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1248 DstAddrSpace = MS->getDestAddressSpace();
1249 }
1250 else
1251 llvm_unreachable("Expected a memcpy/move or memset!");
1252
1253 unsigned Limit, Factor = 2;
1254 switch(I->getIntrinsicID()) {
1255 case Intrinsic::memcpy:
1256 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1257 break;
1258 case Intrinsic::memmove:
1259 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1260 break;
1261 case Intrinsic::memset:
1262 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1263 Factor = 1;
1264 break;
1265 default:
1266 llvm_unreachable("Expected a memcpy/move or memset!");
1267 }
1268
1269 // MemOps will be populated with a list of data types that need to be
1270 // loaded and stored. That's why we multiply the number of elements by 2 to
1271 // get the cost for this memcpy.
1272 std::vector<EVT> MemOps;
1273 LLVMContext &C = F->getContext();
1274 if (getTLI()->findOptimalMemOpLowering(C, MemOps, Limit, MOp, DstAddrSpace,
1275 SrcAddrSpace, F->getAttributes(),
1276 nullptr))
1277 return MemOps.size() * Factor;
1278
1279 // If we can't find an optimal memop lowering, return the default cost
1280 return -1;
1281}
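// Worked example (illustrative): a memcpy of 16 bytes between 4-byte aligned
// pointers might be lowered as four i32 loads plus four i32 stores; MemOps
// would then have four entries and, with Factor == 2 for memcpy, this returns
// 8 memory operations.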
1282
1283InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
1284 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1285
1286 // To model the cost of a library call, we assume 1 for the call, and
1287 // 3 for the argument setup.
1288 if (NumOps == -1)
1289 return 4;
1290 return NumOps;
1291}
1292
1293InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1294 VectorType *DstTy, VectorType *SrcTy,
1295 ArrayRef<int> Mask,
1296 TTI::TargetCostKind CostKind,
1297 int Index, VectorType *SubTp,
1298 ArrayRef<const Value *> Args,
1299 const Instruction *CxtI) const {
1300 assert((Mask.empty() || DstTy->isScalableTy() ||
1301 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1302 "Expected the Mask to match the return size if given");
1303 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1304 "Expected the same scalar types");
1305
1306 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1307 // Treat extractsubvector as single op permutation.
1308 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1309 if (IsExtractSubvector)
1310 Kind = TTI::SK_PermuteSingleSrc;
1311 if (ST->hasNEON()) {
1312 if (Kind == TTI::SK_Broadcast) {
1313 static const CostTblEntry NEONDupTbl[] = {
1314 // VDUP handles these cases.
1315 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1316 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1317 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1318 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1319 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1320 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1321
1322 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1323 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1324 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1325 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1326
1327 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1328 if (const auto *Entry =
1329 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1330 return LT.first * Entry->Cost;
1331 }
1332 if (Kind == TTI::SK_Reverse) {
1333 static const CostTblEntry NEONShuffleTbl[] = {
1334 // Reverse shuffle cost one instruction if we are shuffling within a
1335 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1336 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1337 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1338 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1339 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1340 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1341 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1342
1343 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1344 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1345 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1346 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1347
1348 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1349 if (const auto *Entry =
1350 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1351 return LT.first * Entry->Cost;
1352 }
1353 if (Kind == TTI::SK_Select) {
1354 static const CostTblEntry NEONSelShuffleTbl[] = {
1355 // Select shuffle cost table for ARM. Cost is the number of
1356 // instructions
1357 // required to create the shuffled vector.
1358
1359 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1360 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1361 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1362 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1363
1364 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1365 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1366 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1367
1368 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1369
1370 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1371
1372 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1373 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1374 ISD::VECTOR_SHUFFLE, LT.second))
1375 return LT.first * Entry->Cost;
1376 }
1377 }
1378 if (ST->hasMVEIntegerOps()) {
1379 if (Kind == TTI::SK_Broadcast) {
1380 static const CostTblEntry MVEDupTbl[] = {
1381 // VDUP handles these cases.
1382 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1383 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1384 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1385 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1386 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1387
1388 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1389 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1390 LT.second))
1391 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1392 }
1393
1394 if (!Mask.empty()) {
1395 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1396 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1397 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1398 // free, but we model it with a cost of LT.first so that LD2/LD4 have a
1399 // higher cost than just the load.
1400 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
1401 (LT.second.getScalarSizeInBits() == 8 ||
1402 LT.second.getScalarSizeInBits() == 16 ||
1403 LT.second.getScalarSizeInBits() == 32) &&
1404 LT.second.getSizeInBits() == 128 &&
1405 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1406 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2)) ||
1407 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1408 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))))
1409 return ST->getMVEVectorCostFactor(CostKind) *
1410 std::max<InstructionCost>(1, LT.first / 4);
1411
1412 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1413 // store(interleaving-shuffle). The shuffle cost could potentially be
1414 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1415 // higher cost than just the store.
1416 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
1417 (LT.second.getScalarSizeInBits() == 8 ||
1418 LT.second.getScalarSizeInBits() == 16 ||
1419 LT.second.getScalarSizeInBits() == 32) &&
1420 LT.second.getSizeInBits() == 128 &&
1421 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1422 ShuffleVectorInst::isInterleaveMask(
1423 Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1424 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1425 ShuffleVectorInst::isInterleaveMask(
1426 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
1427 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1428
1429 if (LT.second.isVector() &&
1430 Mask.size() <= LT.second.getVectorNumElements() &&
1431 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1432 isVREVMask(Mask, LT.second, 64)))
1433 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1434 }
1435 }
1436
1437 // Restore optimal kind.
1438 if (IsExtractSubvector)
1439 Kind = TTI::SK_ExtractSubvector;
1440 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1441 ? ST->getMVEVectorCostFactor(CostKind)
1442 : 1;
1443 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1444 Index, SubTp);
1445}
1446
1447InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1448 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1449 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1450 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1451 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1452 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1453 // Make operations on i1 relatively expensive as this often involves
1454 // combining predicates. AND and XOR should be easier to handle with IT
1455 // blocks.
1456 switch (ISDOpcode) {
1457 default:
1458 break;
1459 case ISD::AND:
1460 case ISD::XOR:
1461 return 2;
1462 case ISD::OR:
1463 return 3;
1464 }
1465 }
1466
1467 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1468
1469 if (ST->hasNEON()) {
1470 const unsigned FunctionCallDivCost = 20;
1471 const unsigned ReciprocalDivCost = 10;
1472 static const CostTblEntry CostTbl[] = {
1473 // Division.
1474 // These costs are somewhat random. Choose a cost of 20 to indicate that
1475 // vectorizing division (an added function call) is going to be very expensive.
1476 // Double registers types.
1477 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1478 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1479 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1480 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1481 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1482 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1483 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1484 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1485 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1486 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1487 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1488 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1489 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1490 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1491 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1492 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1493 // Quad register types.
1494 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1495 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1496 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1497 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1498 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1499 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1500 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1501 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1502 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1503 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1504 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1505 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1506 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1507 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1508 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1509 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1510 // Multiplication.
1511 };
1512
1513 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1514 return LT.first * Entry->Cost;
1515
1516 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1517 Opcode, Ty, CostKind, Op1Info, Op2Info);
1518
1519 // This is somewhat of a hack. The problem that we are facing is that SROA
1520 // creates a sequence of shift, and, or instructions to construct values.
1521 // These sequences are recognized by the ISel and have zero-cost. Not so for
1522 // the vectorized code. Because we have support for v2i64 but not i64 those
1523 // sequences look particularly beneficial to vectorize.
1524 // To work around this we increase the cost of v2i64 operations to make them
1525 // seem less beneficial.
1526 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1527 Cost += 4;
1528
1529 return Cost;
1530 }
1531
1532 // If this operation is a shift on arm/thumb2, it might well be folded into
1533 // the following instruction, hence having a cost of 0.
1534 auto LooksLikeAFreeShift = [&]() {
1535 if (ST->isThumb1Only() || Ty->isVectorTy())
1536 return false;
1537
1538 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1539 return false;
1540 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1541 return false;
1542
1543 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1544 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1545 case Instruction::Add:
1546 case Instruction::Sub:
1547 case Instruction::And:
1548 case Instruction::Xor:
1549 case Instruction::Or:
1550 case Instruction::ICmp:
1551 return true;
1552 default:
1553 return false;
1554 }
1555 };
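// For instance (illustrative), "%s = shl i32 %a, 2" feeding a single add can
// be folded into "add r0, r1, r2, lsl #2", so the shift itself is free.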
1556 if (LooksLikeAFreeShift())
1557 return 0;
1558
1559 // When targets have both DSP and MVE we find that
1560 // the compiler will attempt to vectorize as well as using
1561 // scalar (S/U)MLAL operations. This is in cases where we have
1562 // the pattern ext(mul(ext(i16), ext(i16))) we find
1563 // that codegen performs better when only using (S/U)MLAL scalar
1564 // ops instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
1565 // check if a mul instruction is used in a (U/S)MLAL pattern.
1566 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1567 Type *Ty) -> bool {
1568 if (!ST->hasDSP())
1569 return false;
1570
1571 if (!I)
1572 return false;
1573
1574 if (Opcode != Instruction::Mul)
1575 return false;
1576
1577 if (Ty->isVectorTy())
1578 return false;
1579
1580 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1581 return cast<Instruction>(LHS)->getOpcode() ==
1582 cast<Instruction>(RHS)->getOpcode();
1583 };
1584 auto IsExtInst = [](const Value *V) -> bool {
1585 return isa<ZExtInst>(V) || isa<SExtInst>(V);
1586 };
1587 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1588 return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1589 };
1590
1591 // We check the arguments of the instruction to see if they're extends
1592 auto *BinOp = dyn_cast<BinaryOperator>(I);
1593 if (!BinOp)
1594 return false;
1595 Value *Op0 = BinOp->getOperand(0);
1596 Value *Op1 = BinOp->getOperand(1);
1597 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1598 // We're interested in an ext of an i16
1599 if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1600 !IsExtensionFromHalf(Op1))
1601 return false;
1602 // We need to check that this result will be further extended to i64, i.e.
1603 // that all of its uses are extend instructions
1604 for (auto *U : I->users())
1605 if (!IsExtInst(U))
1606 return false;
1607 return true;
1608 }
1609
1610 return false;
1611 };
1612
1613 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1614 return 0;
1615
1616 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1617 // for "multiple beats" potentially needed by MVE instructions.
1618 int BaseCost = 1;
1619 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1620 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1621
1622 // The rest of this mostly follows what is done in
1623 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1624 // than scalars or increasing the costs for custom operations. The result is
1625 // also multiplied by the MVEVectorCostFactor where appropriate.
1626 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1627 return LT.first * BaseCost;
1628
1629 // Else this is expand, assume that we need to scalarize this op.
1630 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1631 unsigned Num = VTy->getNumElements();
1632 InstructionCost Cost =
1633 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1634 // Return the cost of multiple scalar invocation plus the cost of
1635 // inserting and extracting the values.
1636 SmallVector<Type *> Tys(Args.size(), Ty);
1637 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1638 Num * Cost;
1639 }
1640
1641 return BaseCost;
1642}
1643
1645 Align Alignment,
1646 unsigned AddressSpace,
1648 TTI::OperandValueInfo OpInfo,
1649 const Instruction *I) const {
1650 // TODO: Handle other cost kinds.
1651 if (CostKind != TTI::TCK_RecipThroughput)
1652 return 1;
1653
1654 // Type legalization can't handle structs
1655 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1656 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1657 CostKind);
1658
1659 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1660 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1661 // Unaligned loads/stores are extremely inefficient.
1662 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1663 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1664 return LT.first * 4;
1665 }
1666
1667 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1668 // Same for stores.
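 // Illustrative example (editorial, not from the upstream file) of the pattern
 // being matched here:
 //   %l = load <4 x half>, ptr %p
 //   %e = fpext <4 x half> %l to <4 x float>   ; handled as one widening load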
1669 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1670 ((Opcode == Instruction::Load && I->hasOneUse() &&
1671 isa<FPExtInst>(*I->user_begin())) ||
1672 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1674 Type *DstTy =
1675 Opcode == Instruction::Load
1676 ? (*I->user_begin())->getType()
1677 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1678 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1679 DstTy->getScalarType()->isFloatTy())
1680 return ST->getMVEVectorCostFactor(CostKind);
1681 }
1682
1683 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1684 ? ST->getMVEVectorCostFactor(CostKind)
1685 : 1;
1686 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1687 CostKind, OpInfo, I);
1688}
1689
1693 switch (MICA.getID()) {
1694 case Intrinsic::masked_scatter:
1695 case Intrinsic::masked_gather:
1696 return getGatherScatterOpCost(MICA, CostKind);
1697 case Intrinsic::masked_load:
1698 case Intrinsic::masked_store:
1699 return getMaskedMemoryOpCost(MICA, CostKind);
1700 }
1702}
1703
1707 unsigned IID = MICA.getID();
1708 Type *Src = MICA.getDataType();
1709 Align Alignment = MICA.getAlignment();
1710 unsigned AddressSpace = MICA.getAddressSpace();
1711 if (ST->hasMVEIntegerOps()) {
1712 if (IID == Intrinsic::masked_load &&
1713 isLegalMaskedLoad(Src, Alignment, AddressSpace))
1714 return ST->getMVEVectorCostFactor(CostKind);
1715 if (IID == Intrinsic::masked_store &&
1716 isLegalMaskedStore(Src, Alignment, AddressSpace))
1717 return ST->getMVEVectorCostFactor(CostKind);
1718 }
1719 if (!isa<FixedVectorType>(Src))
1722 // Scalar cost, which is currently very high due to the inefficiency of the
1722 // generated code.
1723 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1724}
1725
1727 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1728 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1729 bool UseMaskForCond, bool UseMaskForGaps) const {
1730 assert(Factor >= 2 && "Invalid interleave factor");
1731 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1732
1733 // vldN/vstN doesn't support vector types of i64/f64 element.
1734 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1735
1736 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1737 !UseMaskForCond && !UseMaskForGaps) {
1738 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1739 auto *SubVecTy =
1740 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1741
1742 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1743 // Accesses having vector types that are a multiple of 128 bits can be
1744 // matched to more than one vldN/vstN instruction.
1745 int BaseCost =
1746 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1747 if (NumElts % Factor == 0 &&
1748 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1749 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1750
1751 // Some smaller than legal interleaved patterns are cheap as we can make
1752 // use of the vmovn or vrev patterns to interleave a standard load. This is
1753 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1754 // promoted differently). The cost of 2 here is then a load and vrev or
1755 // vmovn.
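 // Illustrative example (editorial): a factor-2 deinterleave of a <8 x i8>
 // load has <4 x i8> sub-vectors (<= 64 bits), so it takes the 2 * BaseCost
 // path below, roughly one load plus one vrev/vmovn-style shuffle.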
1756 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1757 VecTy->isIntOrIntVectorTy() &&
1758 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1759 return 2 * BaseCost;
1760 }
1761
1762 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1763 Alignment, AddressSpace, CostKind,
1764 UseMaskForCond, UseMaskForGaps);
1765}
1766
1770
1771 Type *DataTy = MICA.getDataType();
1772 const Value *Ptr = MICA.getPointer();
1773 bool VariableMask = MICA.getVariableMask();
1774 Align Alignment = MICA.getAlignment();
1775 const Instruction *I = MICA.getInst();
1776
1777 using namespace PatternMatch;
1778 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1780
1781 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1782 auto *VTy = cast<FixedVectorType>(DataTy);
1783
1784 // TODO: Splitting, once we do that.
1785
1786 unsigned NumElems = VTy->getNumElements();
1787 unsigned EltSize = VTy->getScalarSizeInBits();
1788 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1789
1790 // For now, it is assumed that for the MVE gather instructions the loads are
1791 // all effectively serialised. This means the cost is the scalar cost
1792 // multiplied by the number of elements being loaded. This is possibly very
1793 // conservative, but even so we still end up vectorising loops because the
1794 // cost per iteration for many loops is lower than for scalar loops.
1795 InstructionCost VectorCost =
1796 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1797 // The scalarization cost should be a lot higher. We use the number of vector
1798 // elements plus the scalarization overhead. If masking is required then a lot
1799 // of little blocks will be needed and potentially a scalarized p0 mask,
1800 // greatly increasing the cost.
1801 InstructionCost ScalarCost =
1802 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1803 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1804 CostKind) +
1805 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1806 CostKind);
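 // Editorial worked example (illustrative only), assuming a legal v4i32 gather
 // with a variable mask: VectorCost = 4 * LT.first * MVEVectorCostFactor, while
 // ScalarCost = 4 * LT.first + 4 * 5 plus the insert/extract overhead above.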
1807
1808 if (EltSize < 8 || Alignment < EltSize / 8)
1809 return ScalarCost;
1810
1811 unsigned ExtSize = EltSize;
1812 // Check whether there's a single user that asks for an extended type
1813 if (I != nullptr) {
1814 // Depending on the caller of this function, a gather instruction will
1815 // either have opcode Instruction::Load or be a call to the masked_gather
1816 // intrinsic
1817 if ((I->getOpcode() == Instruction::Load ||
1818 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1819 I->hasOneUse()) {
1820 const User *Us = *I->users().begin();
1821 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1822 // only allow valid type combinations
1823 unsigned TypeSize =
1824 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1825 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1826 (TypeSize == 16 && EltSize == 8)) &&
1827 TypeSize * NumElems == 128) {
1828 ExtSize = TypeSize;
1829 }
1830 }
1831 }
1832 // Check whether the input data needs to be truncated
1833 TruncInst *T;
1834 if ((I->getOpcode() == Instruction::Store ||
1835 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1836 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1837 // Only allow valid type combinations
1838 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1839 if (((EltSize == 16 && TypeSize == 32) ||
1840 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1841 TypeSize * NumElems == 128)
1842 ExtSize = TypeSize;
1843 }
1844 }
1845
1846 if (ExtSize * NumElems != 128 || NumElems < 4)
1847 return ScalarCost;
1848
1849 // Any (aligned) i32 gather will not need to be scalarised.
1850 if (ExtSize == 32)
1851 return VectorCost;
1852 // For smaller types, we need to ensure that the gep's inputs are correctly
1853 // extended from a small enough value. Other sizes (including i64) are
1854 // scalarized for now.
1855 if (ExtSize != 8 && ExtSize != 16)
1856 return ScalarCost;
1857
1858 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1859 Ptr = BC->getOperand(0);
1860 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1861 if (GEP->getNumOperands() != 2)
1862 return ScalarCost;
1863 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1864 // Scale needs to be correct (which is only relevant for i16s).
1865 if (Scale != 1 && Scale * 8 != ExtSize)
1866 return ScalarCost;
1867 // And we need to zext (not sext) the indexes from a small enough type.
1868 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1869 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1870 return VectorCost;
1871 }
1872 return ScalarCost;
1873 }
1874 return ScalarCost;
1875}
1876
1879 std::optional<FastMathFlags> FMF,
1881
1882 EVT ValVT = TLI->getValueType(DL, ValTy);
1883 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1884 unsigned EltSize = ValVT.getScalarSizeInBits();
1885
1886 // In general floating point reductions are a series of elementwise
1887 // operations, with free extracts on each step. These are either in-order or
1888 // treewise depending on whether that is allowed by the fast math flags.
1889 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1890 ((EltSize == 32 && ST->hasVFP2Base()) ||
1891 (EltSize == 64 && ST->hasFP64()) ||
1892 (EltSize == 16 && ST->hasFullFP16()))) {
1893 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1894 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1895 InstructionCost VecCost = 0;
1896 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1897 NumElts * EltSize > VecLimit) {
1898 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1899 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1900 NumElts /= 2;
1901 }
1902
1903 // For fp16 we need to extract the upper lane elements. MVE can add a
1904 // VREV+FMIN/MAX to perform another vector step instead.
1905 InstructionCost ExtractCost = 0;
1906 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1907 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1908 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1909 NumElts /= 2;
1910 } else if (ValVT.getVectorElementType() == MVT::f16)
1911 ExtractCost = NumElts / 2;
1912
1913 return VecCost + ExtractCost +
1914 NumElts *
1915 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1916 }
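 // Editorial worked example (illustrative only): a fast-math v8f32 fadd
 // reduction with MVE is costed above as one v4f32 vector fadd (256 -> 128
 // bits) plus 4 scalar fadds for the remaining lanes, i.e. VecCost + 4 * scalar.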
1917
1918 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1919 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1920 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1921 unsigned VecLimit =
1922 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1923 InstructionCost VecCost = 0;
1924 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1925 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1926 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1927 NumElts /= 2;
1928 }
1929 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1930 // step.
1931 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1932 NumElts * EltSize == 64) {
1933 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1934 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1935 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1936 NumElts /= 2;
1937 }
1938
1939 // From here we extract the elements and perform the and/or/xor.
1940 InstructionCost ExtractCost = NumElts;
1941 return VecCost + ExtractCost +
1942 (NumElts - 1) * getArithmeticInstrCost(
1943 Opcode, ValTy->getElementType(), CostKind);
1944 }
1945
1946 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1947 TTI::requiresOrderedReduction(FMF))
1948 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1949
1950 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1951
1952 static const CostTblEntry CostTblAdd[]{
1953 {ISD::ADD, MVT::v16i8, 1},
1954 {ISD::ADD, MVT::v8i16, 1},
1955 {ISD::ADD, MVT::v4i32, 1},
1956 };
1957 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1958 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1959
1960 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1961}
1962
1964 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1965 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1966 EVT ValVT = TLI->getValueType(DL, ValTy);
1967 EVT ResVT = TLI->getValueType(DL, ResTy);
1968
1969 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1970
1971 switch (ISD) {
1972 case ISD::ADD:
1973 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1974 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1975
1976 // The legal cases are:
1977 // VADDV u/s 8/16/32
1978 // VADDLV u/s 32
1979 // Codegen currently cannot always handle larger than legal vectors very
1980 // well, especially for predicated reductions where the mask needs to be
1981 // split, so restrict to 128bit or smaller input types.
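 // Editorial example (illustrative only): a v8i16 vecreduce.add that is
 // extended to an i32 result fits the VADDV.u16/s16 case and takes the
 // MVEVectorCostFactor * LT.first path below.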
1982 unsigned RevVTSize = ResVT.getSizeInBits();
1983 if (ValVT.getSizeInBits() <= 128 &&
1984 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1985 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1986 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1987 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1988 }
1989 break;
1990 default:
1991 break;
1992 }
1993 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1994 CostKind);
1995}
1996
1998ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
1999 Type *ResTy, VectorType *ValTy,
2001 if (RedOpcode != Instruction::Add)
2003 EVT ValVT = TLI->getValueType(DL, ValTy);
2004 EVT ResVT = TLI->getValueType(DL, ResTy);
2005
2006 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
2007 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2008
2009 // The legal cases are:
2010 // VMLAV u/s 8/16/32
2011 // VMLALV u/s 16/32
2012 // Codegen currently cannot always handle larger than legal vectors very
2013 // well, especially for predicated reductions where the mask needs to be
2014 // split, so restrict to 128bit or smaller input types.
2015 unsigned RevVTSize = ResVT.getSizeInBits();
2016 if (ValVT.getSizeInBits() <= 128 &&
2017 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
2018 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
2019 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
2020 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
2021 }
2022
2023 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
2024 CostKind);
2025}
2026
2029 FastMathFlags FMF,
2031 EVT ValVT = TLI->getValueType(DL, Ty);
2032
2033 // In general floating point reductions are a series of elementwise
2034 // operations, with free extracts on each step. These are either in-order or
2035 // treewise depending on whether that is allowed by the fast math flags.
2036 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
2037 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
2038 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
2039 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
2040 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
2041 unsigned EltSize = ValVT.getScalarSizeInBits();
2042 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
2043 InstructionCost VecCost;
2044 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
2045 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
2046 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
2047 VecCost += getIntrinsicInstrCost(ICA, CostKind);
2048 NumElts /= 2;
2049 }
2050
2051 // For fp16 we need to extract the upper lane elements. MVE can add a
2052 // VREV+FMIN/MAX to perform another vector step instead.
2053 InstructionCost ExtractCost = 0;
2054 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
2055 NumElts == 8) {
2056 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
2057 NumElts /= 2;
2058 } else if (ValVT.getVectorElementType() == MVT::f16)
2059 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
2060
2061 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
2062 {Ty->getElementType(), Ty->getElementType()},
2063 FMF);
2064 return VecCost + ExtractCost +
2065 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
2066 }
2067
2068 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
2069 IID == Intrinsic::umin || IID == Intrinsic::umax) {
2070 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2071
2072 // All costs are the same for u/s min/max. These lower to vminv, which are
2073 // given a slightly higher cost as they tend to take multiple cycles for
2074 // smaller type sizes.
2075 static const CostTblEntry CostTblAdd[]{
2076 {ISD::SMIN, MVT::v16i8, 4},
2077 {ISD::SMIN, MVT::v8i16, 3},
2078 {ISD::SMIN, MVT::v4i32, 2},
2079 };
2080 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
2081 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2082 }
2083
2084 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2085}
2086
2090 unsigned Opc = ICA.getID();
2091 switch (Opc) {
2092 case Intrinsic::get_active_lane_mask:
2093 // Currently we make a somewhat optimistic assumption that
2094 // active_lane_masks are always free. In reality it may be freely folded
2095 // into a tail-predicated loop, expanded into a VCTP or expanded into a lot
2096 // of add/icmp code. We may need to improve this in the future, but being
2097 // able to detect if it is free or not involves looking at a lot of other
2098 // code. We currently assume that the vectorizer inserted these, and knew
2099 // what it was doing in adding one.
2100 if (ST->hasMVEIntegerOps())
2101 return 0;
2102 break;
2103 case Intrinsic::sadd_sat:
2104 case Intrinsic::ssub_sat:
2105 case Intrinsic::uadd_sat:
2106 case Intrinsic::usub_sat: {
2107 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2108 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2109 Type *RetTy = ICA.getReturnType();
2110
2111 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
2112 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2113 return 1; // qadd / qsub
2114 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2115 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2116 // Otherwise return the cost of expanding the node. Generally an add +
2117 // icmp + sel.
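 // Editorial note (illustrative only): for a plain i32 the expansion is costed
 // below as one add plus two compares and two selects that clamp the result to
 // the signed/unsigned min/max on overflow.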
2119 Type *CondTy = RetTy->getWithNewBitWidth(1);
2120 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
2121 RetTy, CostKind) +
2122 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
2123 CostKind) +
2124 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
2125 CostKind);
2126 }
2127
2128 if (!ST->hasMVEIntegerOps())
2129 break;
2130
2131 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2132 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2133 LT.second == MVT::v16i8) {
2134 // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
2135 // need to extend the type, as it uses shr(qadd(shl, shl)).
2136 unsigned Instrs =
2137 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2138 : 4;
2139 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2140 }
2141 break;
2142 }
2143 case Intrinsic::abs:
2144 case Intrinsic::smin:
2145 case Intrinsic::smax:
2146 case Intrinsic::umin:
2147 case Intrinsic::umax: {
2148 if (!ST->hasMVEIntegerOps())
2149 break;
2150 Type *VT = ICA.getReturnType();
2151
2152 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2153 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2154 LT.second == MVT::v16i8)
2155 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2156 break;
2157 }
2158 case Intrinsic::minnum:
2159 case Intrinsic::maxnum: {
2160 if (!ST->hasMVEFloatOps())
2161 break;
2162 Type *VT = ICA.getReturnType();
2163 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2164 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2165 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2166 break;
2167 }
2168 case Intrinsic::fptosi_sat:
2169 case Intrinsic::fptoui_sat: {
2170 if (ICA.getArgTypes().empty())
2171 break;
2172 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2173 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
2174 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
2175 // Check for the legal types, with the correct subtarget features.
2176 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2177 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2178 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2179 return LT.first;
2180
2181 // Equally for MVE vector types
2182 if (ST->hasMVEFloatOps() &&
2183 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2184 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2185 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2186
2187 // If we can, use a legal convert followed by a min+max.
2188 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2189 (ST->hasFP64() && LT.second == MVT::f64) ||
2190 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2191 (ST->hasMVEFloatOps() &&
2192 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2193 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2194 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2195 LT.second.getScalarSizeInBits());
2196 InstructionCost Cost =
2197 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2198 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2199 : Intrinsic::umin,
2200 LegalTy, {LegalTy, LegalTy});
2201 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
2202 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2203 : Intrinsic::umax,
2204 LegalTy, {LegalTy, LegalTy});
2205 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2206 return LT.first * Cost;
2207 }
2208 // Otherwise we need to follow the default expansion that clamps the value
2210 // using a float min/max with an fcmp+sel for NaN handling when signed.
2210 Type *FPTy = ICA.getArgTypes()[0];
2211 Type *RetTy = ICA.getReturnType();
2212 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2213 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
2214 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2215 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2216 Cost +=
2217 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2218 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2219 if (IsSigned) {
2220 Type *CondTy = RetTy->getWithNewBitWidth(1);
2221 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2223 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2225 }
2226 return Cost;
2227 }
2228 }
2229
2230 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2231}
2232
2233 bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2234 if (!F->isIntrinsic())
2235 return BaseT::isLoweredToCall(F);
2236
2237 // Assume all Arm-specific intrinsics map to an instruction.
2238 if (F->getName().starts_with("llvm.arm"))
2239 return false;
2240
2241 switch (F->getIntrinsicID()) {
2242 default: break;
2243 case Intrinsic::powi:
2244 case Intrinsic::sin:
2245 case Intrinsic::cos:
2246 case Intrinsic::sincos:
2247 case Intrinsic::pow:
2248 case Intrinsic::log:
2249 case Intrinsic::log10:
2250 case Intrinsic::log2:
2251 case Intrinsic::exp:
2252 case Intrinsic::exp2:
2253 return true;
2254 case Intrinsic::sqrt:
2255 case Intrinsic::fabs:
2256 case Intrinsic::copysign:
2257 case Intrinsic::floor:
2258 case Intrinsic::ceil:
2259 case Intrinsic::trunc:
2260 case Intrinsic::rint:
2261 case Intrinsic::nearbyint:
2262 case Intrinsic::round:
2263 case Intrinsic::canonicalize:
2264 case Intrinsic::lround:
2265 case Intrinsic::llround:
2266 case Intrinsic::lrint:
2267 case Intrinsic::llrint:
2268 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2269 return true;
2270 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2271 return true;
2272 // Some operations can be handled by vector instructions; assume that
2273 // unsupported vectors will be expanded into supported scalar ones.
2274 // TODO Handle scalar operations properly.
2275 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2276 case Intrinsic::masked_store:
2277 case Intrinsic::masked_load:
2278 case Intrinsic::masked_gather:
2279 case Intrinsic::masked_scatter:
2280 return !ST->hasMVEIntegerOps();
2281 case Intrinsic::sadd_with_overflow:
2282 case Intrinsic::uadd_with_overflow:
2283 case Intrinsic::ssub_with_overflow:
2284 case Intrinsic::usub_with_overflow:
2285 case Intrinsic::sadd_sat:
2286 case Intrinsic::uadd_sat:
2287 case Intrinsic::ssub_sat:
2288 case Intrinsic::usub_sat:
2289 return false;
2290 }
2291
2292 return BaseT::isLoweredToCall(F);
2293}
2294
2295 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2296 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2297 EVT VT = TLI->getValueType(DL, I.getType(), true);
2298 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2299 return true;
2300
2301 // Check if an intrinsic will be lowered to a call and assume that any
2302 // other CallInst will generate a bl.
2303 if (auto *Call = dyn_cast<CallInst>(&I)) {
2304 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2305 switch(II->getIntrinsicID()) {
2306 case Intrinsic::memcpy:
2307 case Intrinsic::memset:
2308 case Intrinsic::memmove:
2309 return getNumMemOps(II) == -1;
2310 default:
2311 if (const Function *F = Call->getCalledFunction())
2312 return isLoweredToCall(F);
2313 }
2314 }
2315 return true;
2316 }
2317
2318 // FPv5 provides conversions between integer, double-precision,
2319 // single-precision, and half-precision formats.
2320 switch (I.getOpcode()) {
2321 default:
2322 break;
2323 case Instruction::FPToSI:
2324 case Instruction::FPToUI:
2325 case Instruction::SIToFP:
2326 case Instruction::UIToFP:
2327 case Instruction::FPTrunc:
2328 case Instruction::FPExt:
2329 return !ST->hasFPARMv8Base();
2330 }
2331
2332 // FIXME: Unfortunately the approach of checking the Operation Action does
2333 // not catch all cases of Legalization that use library calls. Our
2334 // Legalization step categorizes some transformations into library calls as
2335 // Custom, Expand or even Legal when doing type legalization. So for now
2336 // we have to special case for instance the SDIV of 64bit integers and the
2337 // use of floating point emulation.
2338 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2339 switch (ISD) {
2340 default:
2341 break;
2342 case ISD::SDIV:
2343 case ISD::UDIV:
2344 case ISD::SREM:
2345 case ISD::UREM:
2346 case ISD::SDIVREM:
2347 case ISD::UDIVREM:
2348 return true;
2349 }
2350 }
2351
2352 // Assume all other non-float operations are supported.
2353 if (!VT.isFloatingPoint())
2354 return false;
2355
2356 // We'll need a library call to handle most floats when using soft float.
2357 if (TLI->useSoftFloat()) {
2358 switch (I.getOpcode()) {
2359 default:
2360 return true;
2361 case Instruction::Alloca:
2362 case Instruction::Load:
2363 case Instruction::Store:
2364 case Instruction::Select:
2365 case Instruction::PHI:
2366 return false;
2367 }
2368 }
2369
2370 // We'll need a libcall to perform double precision operations on a single
2371 // precision only FPU.
2372 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2373 return true;
2374
2375 // Likewise for half precision arithmetic.
2376 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2377 return true;
2378
2379 return false;
2380}
2381
2382 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2383 AssumptionCache &AC,
2384 TargetLibraryInfo *LibInfo,
2385 HardwareLoopInfo &HWLoopInfo) const {
2386 // Low-overhead branches are only supported in the 'low-overhead branch'
2387 // extension of v8.1-m.
2388 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2389 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2390 return false;
2391 }
2392
2393 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2394 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2395 return false;
2396 }
2397
2398 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2399 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2400 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2401 return false;
2402 }
2403
2404 const SCEV *TripCountSCEV =
2405 SE.getAddExpr(BackedgeTakenCount,
2406 SE.getOne(BackedgeTakenCount->getType()));
2407
2408 // We need to store the trip count in LR, a 32-bit register.
2409 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2410 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2411 return false;
2412 }
2413
2414 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2415 // point in generating a hardware loop if that's going to happen.
2416
2417 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2418 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2419 switch (Call->getIntrinsicID()) {
2420 default:
2421 break;
2422 case Intrinsic::start_loop_iterations:
2423 case Intrinsic::test_start_loop_iterations:
2424 case Intrinsic::loop_decrement:
2425 case Intrinsic::loop_decrement_reg:
2426 return true;
2427 }
2428 }
2429 return false;
2430 };
2431
2432 // Scan the instructions to see if there are any that we know will turn into a
2433 // call or if this loop is already a low-overhead loop or will become a tail
2434 // predicated loop.
2435 bool IsTailPredLoop = false;
2436 auto ScanLoop = [&](Loop *L) {
2437 for (auto *BB : L->getBlocks()) {
2438 for (auto &I : *BB) {
2439 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2440 isa<InlineAsm>(I)) {
2441 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2442 return false;
2443 }
2444 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2445 IsTailPredLoop |=
2446 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2447 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2448 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2449 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2450 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2451 }
2452 }
2453 return true;
2454 };
2455
2456 // Visit inner loops.
2457 for (auto *Inner : *L)
2458 if (!ScanLoop(Inner))
2459 return false;
2460
2461 if (!ScanLoop(L))
2462 return false;
2463
2464 // TODO: Check whether the trip count calculation is expensive. If L is the
2465 // inner loop but we know it has a low trip count, calculating that trip
2466 // count (in the parent loop) may be detrimental.
2467
2468 LLVMContext &C = L->getHeader()->getContext();
2469 HWLoopInfo.CounterInReg = true;
2470 HWLoopInfo.IsNestingLegal = false;
2471 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2472 HWLoopInfo.CountType = Type::getInt32Ty(C);
2473 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2474 return true;
2475}
2476
2477static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2478 // We don't allow icmp's, and because we only look at single block loops,
2479 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2480 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2481 return false;
2482 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2483 // not currently canonical, but soon will be. Code without them uses icmp, and
2484 // so is not tail predicated as per the condition above. In order to get the
2485 // same performance we treat min and max the same as an icmp for tailpred
2486 // purposes for the moment (we often rely on non-tailpred and higher VFs to
2487 // pick more optimal instructions like VQDMULH. They need to be recognized
2488 // directly by the vectorizer).
2489 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2490 if ((II->getIntrinsicID() == Intrinsic::smin ||
2491 II->getIntrinsicID() == Intrinsic::smax ||
2492 II->getIntrinsicID() == Intrinsic::umin ||
2493 II->getIntrinsicID() == Intrinsic::umax) &&
2494 ++ICmpCount > 1)
2495 return false;
2496
2497 if (isa<FCmpInst>(&I))
2498 return false;
2499
2500 // We could allow extending/narrowing FP loads/stores, but codegen is
2501 // too inefficient so reject this for now.
2502 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2503 return false;
2504
2505 // Extends have to be extending-loads
2506 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2507 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2508 return false;
2509
2510 // Truncs have to be narrowing-stores
2511 if (isa<TruncInst>(&I) )
2512 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2513 return false;
2514
2515 return true;
2516}
2517
2518// To set up a tail-predicated loop, we need to know the total number of
2519// elements processed by that loop. Thus, we need to determine the element
2520// size and:
2521// 1) it should be uniform for all operations in the vector loop, so we
2522// e.g. don't want any widening/narrowing operations.
2523// 2) it should be smaller than i64s because we don't have vector operations
2524// that work on i64s.
2525// 3) we don't want elements to be reversed or shuffled, to make sure the
2526// tail-predication masks/predicates the right lanes.
2527//
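// Editorial example (illustrative only): a loop whose body computes on i64
// values is rejected by the checks below because its element size exceeds
// 32 bits, so no MVE operation can cover those lanes.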
2528 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2529 const DataLayout &DL,
2530 const LoopAccessInfo *LAI,
2531 const DominatorTree &DT) {
2532 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2533
2534 // If there are live-out values, it is probably a reduction. We can predicate
2535 // most reduction operations freely under MVE using a combination of
2536 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2537 // floating point and integer reductions, but don't check for operators
2538 // specifically here. If the value ends up not being a reduction (and so the
2539 // vectorizer cannot tailfold the loop), we should fall back to standard
2540 // vectorization automatically.
2542 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2543 bool ReductionsDisabled =
2546
2547 for (auto *I : LiveOuts) {
2548 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2549 !I->getType()->isHalfTy()) {
2550 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2551 "live-out value\n");
2552 return false;
2553 }
2554 if (ReductionsDisabled) {
2555 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2556 return false;
2557 }
2558 }
2559
2560 // Next, check that all instructions can be tail-predicated.
2561 PredicatedScalarEvolution PSE = LAI->getPSE();
2562 int ICmpCount = 0;
2563
2564 for (BasicBlock *BB : L->blocks()) {
2565 for (Instruction &I : BB->instructionsWithoutDebug()) {
2566 if (isa<PHINode>(&I))
2567 continue;
2568 if (!canTailPredicateInstruction(I, ICmpCount)) {
2569 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2570 return false;
2571 }
2572
2573 Type *T = I.getType();
2574 if (T->getScalarSizeInBits() > 32) {
2575 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2576 return false;
2577 }
2578 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2579 Value *Ptr = getLoadStorePointerOperand(&I);
2580 Type *AccessTy = getLoadStoreType(&I);
2581 int64_t NextStride =
2582 getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0);
2583 if (NextStride == 1) {
2584 // TODO: for now only allow consecutive strides of 1. We could support
2585 // other strides as long as it is uniform, but let's keep it simple
2586 // for now.
2587 continue;
2588 } else if (NextStride == -1 ||
2589 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2590 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2592 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2593 "be tail-predicated\n.");
2594 return false;
2595 // TODO: don't tail predicate if there is a reversed load?
2596 } else if (EnableMaskedGatherScatters) {
2597 // Gather/scatters do allow loading from arbitrary strides, at
2598 // least if they are loop invariant.
2599 // TODO: Loop variant strides should in theory work, too, but
2600 // this requires further testing.
2601 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2602 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2603 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2604 if (PSE.getSE()->isLoopInvariant(Step, L))
2605 continue;
2606 }
2607 }
2608 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2609 "tail-predicate\n.");
2610 return false;
2611 }
2612 }
2613 }
2614
2615 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2616 return true;
2617}
2618
2619 bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2620 if (!EnableTailPredication) {
2621 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2622 return false;
2623 }
2624
2625 // Creating a predicated vector loop is the first step for generating a
2626 // tail-predicated hardware loop, for which we need the MVE masked
2627 // load/stores instructions:
2628 if (!ST->hasMVEIntegerOps())
2629 return false;
2630
2631 LoopVectorizationLegality *LVL = TFI->LVL;
2632 Loop *L = LVL->getLoop();
2633
2634 // For now, restrict this to single block loops.
2635 if (L->getNumBlocks() > 1) {
2636 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2637 "loop.\n");
2638 return false;
2639 }
2640
2641 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2642
2643 LoopInfo *LI = LVL->getLoopInfo();
2644 HardwareLoopInfo HWLoopInfo(L);
2645 if (!HWLoopInfo.canAnalyze(*LI)) {
2646 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2647 "analyzable.\n");
2648 return false;
2649 }
2650
2653
2654 // This checks if we have the low-overhead branch architecture
2655 // extension, and if we will create a hardware-loop:
2656 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2657 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2658 "profitable.\n");
2659 return false;
2660 }
2661
2662 DominatorTree *DT = LVL->getDominatorTree();
2663 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2664 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2665 "a candidate.\n");
2666 return false;
2667 }
2668
2669 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
2670 *LVL->getDominatorTree());
2671}
2672
2674ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2675 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2676 return TailFoldingStyle::DataWithoutLaneMask;
2677
2678 // Intrinsic @llvm.get.active.lane.mask is supported.
2679 // It is used in the MVETailPredication pass, which requires the number of
2680 // elements processed by this vector loop to setup the tail-predicated
2681 // loop.
2682 return TailFoldingStyle::DataAndControlFlow;
2683}
2686 OptimizationRemarkEmitter *ORE) const {
2687 // Enable upper-bound unrolling universally, provided that we do not see an
2688 // active lane mask, which will be better kept as a loop to become tail
2689 // predicated than to be conditionally unrolled.
2690 UP.UpperBound =
2691 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2692 return isa<IntrinsicInst>(I) &&
2693 cast<IntrinsicInst>(I).getIntrinsicID() ==
2694 Intrinsic::get_active_lane_mask;
2695 });
2696
2697 // Only currently enable these preferences for M-Class cores.
2698 if (!ST->isMClass())
2699 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2700
2701 // Disable loop unrolling for Oz and Os.
2702 UP.OptSizeThreshold = 0;
2703 UP.PartialOptSizeThreshold = 0;
2704 if (L->getHeader()->getParent()->hasOptSize())
2705 return;
2706
2707 SmallVector<BasicBlock*, 4> ExitingBlocks;
2708 L->getExitingBlocks(ExitingBlocks);
2709 LLVM_DEBUG(dbgs() << "Loop has:\n"
2710 << "Blocks: " << L->getNumBlocks() << "\n"
2711 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2712
2713 // Only allow one exit other than the latch. This acts as an early exit
2714 // as it mirrors the profitability calculation of the runtime unroller.
2715 if (ExitingBlocks.size() > 2)
2716 return;
2717
2718 // Limit the CFG of the loop body for targets with a branch predictor.
2719 // Allowing 4 blocks permits if-then-else diamonds in the body.
2720 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2721 return;
2722
2723 // Don't unroll vectorized loops, including the remainder loop
2724 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2725 return;
2726
2727 // Scan the loop: don't unroll loops with calls as this could prevent
2728 // inlining.
2729 InstructionCost Cost = 0;
2730 for (auto *BB : L->getBlocks()) {
2731 for (auto &I : *BB) {
2732 // Don't unroll vectorised loops. MVE does not benefit from it as much as
2733 // scalar code.
2734 if (I.getType()->isVectorTy())
2735 return;
2736
2737 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2738 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2739 if (!isLoweredToCall(F))
2740 continue;
2741 }
2742 return;
2743 }
2744
2745 SmallVector<const Value*, 4> Operands(I.operand_values());
2746 Cost += getInstructionCost(&I, Operands,
2747 TargetTransformInfo::TCK_SizeAndLatency);
2748 }
2749 }
2750
2751 // On v6m cores, there are very few registers available. We can easily end up
2752 // spilling and reloading more registers in an unrolled loop. Look at the
2753 // number of LCSSA phis as a rough measure of how many registers will need to
2754 // be live out of the loop, reducing the default unroll count if more than 1
2755 // value is needed. In the long run, all of this should be being learnt by a
2756 // machine.
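 // Editorial worked example (illustrative only): with two live-out LCSSA phis
 // the default count of 4 below is divided down to 2, and with four or more
 // live-outs it drops to 1 and unrolling is skipped.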
2757 unsigned UnrollCount = 4;
2758 if (ST->isThumb1Only()) {
2759 unsigned ExitingValues = 0;
2760 SmallVector<BasicBlock *, 4> ExitBlocks;
2761 L->getExitBlocks(ExitBlocks);
2762 for (auto *Exit : ExitBlocks) {
2763 // Count the number of LCSSA phis. Exclude values coming from GEPs as
2764 // only the last is expected to be needed for address operands.
2765 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2766 return PH.getNumOperands() != 1 ||
2767 !isa<GetElementPtrInst>(PH.getOperand(0));
2768 });
2769 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2770 }
2771 if (ExitingValues)
2772 UnrollCount /= ExitingValues;
2773 if (UnrollCount <= 1)
2774 return;
2775 }
2776
2777 // For processors with low overhead branching (LOB), runtime unrolling the
2778 // innermost loop is often detrimental to performance. In these cases the loop
2779 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2780 // deeply nested loops get executed multiple times, negating the benefits of
2781 // LOB. This is particularly noticeable when the loop trip count of the
2782 // innermost loop varies within the outer loop, such as in the case of
2783 // triangular matrix decompositions. In these cases we will prefer to not
2784 // unroll the innermost loop, with the intention for it to be executed as a
2785 // low overhead loop.
2786 bool Runtime = true;
2787 if (ST->hasLOB()) {
2788 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2789 const auto *BETC = SE.getBackedgeTakenCount(L);
2790 auto *Outer = L->getOutermostLoop();
2791 if ((L != Outer && Outer != L->getParentLoop()) ||
2792 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2793 Runtime = false;
2794 }
2795 }
2796 }
2797
2798 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2799 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2800
2801 UP.Partial = true;
2802 UP.Runtime = Runtime;
2803 UP.UnrollRemainder = true;
2804 UP.DefaultUnrollRuntimeCount = UnrollCount;
2805 UP.UnrollAndJam = true;
2807
2808 // Force-unrolling small loops can be very useful because of the branch-
2809 // taken cost of the backedge.
2810 if (Cost < ArmForceUnrollThreshold)
2811 UP.Force = true;
2812}
2813
2818
2819 bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2820 if (!ST->hasMVEIntegerOps())
2821 return false;
2822
2823 unsigned ScalarBits = Ty->getScalarSizeInBits();
2824 switch (Kind) {
2825 case RecurKind::Add:
2826 return ScalarBits <= 64;
2827 default:
2828 return false;
2829 }
2830}
2831
2833 if (!ST->hasMVEIntegerOps())
2834 return false;
2835 return true;
2836}
2837
2839 StackOffset BaseOffset,
2840 bool HasBaseReg, int64_t Scale,
2841 unsigned AddrSpace) const {
2842 TargetLoweringBase::AddrMode AM;
2843 AM.BaseGV = BaseGV;
2844 AM.BaseOffs = BaseOffset.getFixed();
2845 AM.HasBaseReg = HasBaseReg;
2846 AM.Scale = Scale;
2847 AM.ScalableOffset = BaseOffset.getScalable();
2848 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2849 if (ST->hasFPAO())
2850 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2851 return 0;
2852 }
2854}
2855
2856bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2857 if (Thumb) {
2858 // B.W is available in any Thumb2-supporting target, and also in every
2859 // version of Armv8-M, even Baseline which does not include the rest of
2860 // Thumb2.
2861 return ST->isThumb2() || ST->hasV8MBaselineOps();
2862 } else {
2863 // B is available in all versions of the Arm ISA, so the only question is
2864 // whether that ISA is available at all.
2865 return ST->hasARMOps();
2866 }
2867}
2868
2869/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2870/// of the vector elements.
2871static bool areExtractExts(Value *Ext1, Value *Ext2) {
2872 using namespace PatternMatch;
2873
2874 auto areExtDoubled = [](Instruction *Ext) {
2875 return Ext->getType()->getScalarSizeInBits() ==
2876 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2877 };
2878
2879 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2880 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2881 !areExtDoubled(cast<Instruction>(Ext1)) ||
2882 !areExtDoubled(cast<Instruction>(Ext2)))
2883 return false;
2884
2885 return true;
2886}
2887
2888/// Check if sinking \p I's operands to I's basic block is profitable, because
2889/// the operands can be folded into a target instruction, e.g.
2890/// sext/zext can be folded into vsubl.
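// Editorial example (illustrative only) of the NEON case handled first below:
//   sub (zext <8 x i8> %a to <8 x i16>), (zext <8 x i8> %b to <8 x i16>)
// can be selected as a single vsubl.u8 once both extends sit in this block.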
2891 bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2892 SmallVectorImpl<Use *> &Ops) const {
2893 using namespace PatternMatch;
2894
2895 if (!I->getType()->isVectorTy())
2896 return false;
2897
2898 if (ST->hasNEON()) {
2899 switch (I->getOpcode()) {
2900 case Instruction::Sub:
2901 case Instruction::Add: {
2902 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2903 return false;
2904 Ops.push_back(&I->getOperandUse(0));
2905 Ops.push_back(&I->getOperandUse(1));
2906 return true;
2907 }
2908 default:
2909 return false;
2910 }
2911 }
2912
2913 if (!ST->hasMVEIntegerOps())
2914 return false;
2915
2916 auto IsFMSMul = [&](Instruction *I) {
2917 if (!I->hasOneUse())
2918 return false;
2919 auto *Sub = cast<Instruction>(*I->users().begin());
2920 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2921 };
2922 auto IsFMS = [&](Instruction *I) {
2923 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2924 match(I->getOperand(1), m_FNeg(m_Value())))
2925 return true;
2926 return false;
2927 };
2928
2929 auto IsSinker = [&](Instruction *I, int Operand) {
2930 switch (I->getOpcode()) {
2931 case Instruction::Add:
2932 case Instruction::Mul:
2933 case Instruction::FAdd:
2934 case Instruction::ICmp:
2935 case Instruction::FCmp:
2936 return true;
2937 case Instruction::FMul:
2938 return !IsFMSMul(I);
2939 case Instruction::Sub:
2940 case Instruction::FSub:
2941 case Instruction::Shl:
2942 case Instruction::LShr:
2943 case Instruction::AShr:
2944 return Operand == 1;
2945 case Instruction::Call:
2946 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2947 switch (II->getIntrinsicID()) {
2948 case Intrinsic::fma:
2949 return !IsFMS(I);
2950 case Intrinsic::sadd_sat:
2951 case Intrinsic::uadd_sat:
2952 case Intrinsic::arm_mve_add_predicated:
2953 case Intrinsic::arm_mve_mul_predicated:
2954 case Intrinsic::arm_mve_qadd_predicated:
2955 case Intrinsic::arm_mve_vhadd:
2956 case Intrinsic::arm_mve_hadd_predicated:
2957 case Intrinsic::arm_mve_vqdmull:
2958 case Intrinsic::arm_mve_vqdmull_predicated:
2959 case Intrinsic::arm_mve_vqdmulh:
2960 case Intrinsic::arm_mve_qdmulh_predicated:
2961 case Intrinsic::arm_mve_vqrdmulh:
2962 case Intrinsic::arm_mve_qrdmulh_predicated:
2963 case Intrinsic::arm_mve_fma_predicated:
2964 return true;
2965 case Intrinsic::ssub_sat:
2966 case Intrinsic::usub_sat:
2967 case Intrinsic::arm_mve_sub_predicated:
2968 case Intrinsic::arm_mve_qsub_predicated:
2969 case Intrinsic::arm_mve_hsub_predicated:
2970 case Intrinsic::arm_mve_vhsub:
2971 return Operand == 1;
2972 default:
2973 return false;
2974 }
2975 }
2976 return false;
2977 default:
2978 return false;
2979 }
2980 };
2981
2982 for (auto OpIdx : enumerate(I->operands())) {
2983 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2984 // Make sure we are not already sinking this operand
2985 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2986 continue;
2987
2988 Instruction *Shuffle = Op;
2989 if (Shuffle->getOpcode() == Instruction::BitCast)
2990 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
2991 // We are looking for a splat that can be sunk.
2992 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
2993 m_ZeroInt()),
2994 m_Undef(), m_ZeroMask())))
2995 continue;
2996 if (!IsSinker(I, OpIdx.index()))
2997 continue;
2998
2999 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3000 // and vector registers
3001 for (Use &U : Op->uses()) {
3002 Instruction *Insn = cast<Instruction>(U.getUser());
3003 if (!IsSinker(Insn, U.getOperandNo()))
3004 return false;
3005 }
3006
3007 Ops.push_back(&Shuffle->getOperandUse(0));
3008 if (Shuffle != Op)
3009 Ops.push_back(&Op->getOperandUse(0));
3010 Ops.push_back(&OpIdx.value());
3011 }
3012 return true;
3013}
3014
3015 unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
3016 Type *ArrayType) const {
3017 if (!UseWidenGlobalArrays) {
3018 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
3019 return false;
3020 }
3021
3022 // Don't modify non-integer array types.
3023 if (!ArrayType || !ArrayType->isArrayTy() ||
3024 !ArrayType->getArrayElementType()->isIntegerTy())
3025 return 0;
3026
3027 // We pad to 4 byte boundaries
3028 if (Size % 4 == 0)
3029 return 0;
3030
3031 unsigned NumBytesToPad = 4 - (Size % 4);
3032 unsigned NewSize = Size + NumBytesToPad;
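 // Editorial worked example (illustrative only): Size == 6 gives
 // NumBytesToPad == 2 and NewSize == 8, which is then checked against the
 // memcpy inlining threshold below.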
3033
3034 // Max number of bytes that memcpy allows for lowering to load/stores before
3035 // it uses a library function (__aeabi_memcpy).
3036 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
3037
3038 if (NewSize > MaxMemIntrinsicSize)
3039 return 0;
3040
3041 return NumBytesToPad;
3042}
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
bool preferPredicatedReductionSelect() const override
bool isLegalMaskedGather(Type *Ty, Align Alignment) const override
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const override
bool isProfitableLSRChainElement(Instruction *I) const override
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Class to represent array types.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
constexpr bool test(unsigned I) const
constexpr size_t size() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:502
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
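A hedged sketch (BB and V are hypothetical, and V is assumed to be a floating-point value) of how the IRBuilder members above are typically combined:
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
static void irBuilderExample(llvm::BasicBlock *BB, llvm::Value *V) {
  llvm::IRBuilder<> Builder(BB->getContext());
  // Append new instructions to the end of BB.
  Builder.SetInsertPoint(BB);
  // Splat V into a 4-element vector.
  llvm::Value *Splat = Builder.CreateVectorSplat(4, V);
  // The i1 constant true.
  llvm::ConstantInt *True = Builder.getTrue();
  // Call @llvm.fabs on V, mangled on V's type.
  llvm::CallInst *Fabs =
      Builder.CreateIntrinsic(llvm::Intrinsic::fabs, {V->getType()}, {V});
  (void)Splat; (void)True; (void)Fabs;
}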
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
AssumptionCache & getAssumptionCache() const
static InstructionCost getInvalid(CostType Val=0)
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isShift() const
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
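A minimal sketch, assuming L and SE are a loop and its ScalarEvolution analysis, of the usual trip-count query built from the ScalarEvolution members above:
#include "llvm/Analysis/ScalarEvolution.h"
static void tripCountExample(llvm::Loop *L, llvm::ScalarEvolution &SE) {
  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return;
  const llvm::SCEV *BTC = SE.getBackedgeTakenCount(L);
  // Trip count = backedge-taken count + 1.
  const llvm::SCEV *TC = SE.getAddExpr(BTC, SE.getOne(BTC->getType()));
  // The trip count does not change while executing L itself.
  bool Invariant = SE.isLoopInvariant(TC, L);
  (void)Invariant;
}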
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
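An illustrative sketch (the mask values are made up) of how isDeInterleaveMaskOfFactor classifies a shuffle mask:
#include "llvm/IR/Instructions.h"
static void deinterleaveMaskExample() {
  // <0, 2, 4, 6> picks every second element: a de-interleave of factor 2
  // starting at index 0; <1, 3, 5, 7> would report Index == 1.
  int Mask[] = {0, 2, 4, 6};
  unsigned Index = 0;
  bool IsDeinterleave =
      llvm::ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, /*Factor=*/2, Index);
  (void)IsDeinterleave;
}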
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
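A tiny illustration of the two StackOffset constructors listed above:
#include "llvm/Support/TypeSize.h"
static void stackOffsetExample() {
  // 16 bytes at a fixed offset.
  llvm::StackOffset Fixed = llvm::StackOffset::getFixed(16);
  // 8 * vscale bytes of scalable stack.
  llvm::StackOffset Scalable = llvm::StackOffset::getScalable(8);
  (void)Fixed; (void)Scalable;
}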
Provides information about what library functions are available for the current target.
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual bool isLoweredToCall(const Function *F) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition Type.h:264
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
Type * getArrayElementType() const
Definition Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
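A hedged sketch of how these encoding queries are commonly used; getSOImmVal and getT2SOImmVal return -1 when the immediate cannot be encoded (the constants below are illustrative):
// Within the ARM backend: #include "MCTargetDesc/ARMAddressingModes.h"
static void armImmediateExamples() {
  // ARM mode: 0xFF000000 is an 8-bit value rotated by an even amount, so it encodes.
  bool ARMEncodable = llvm::ARM_AM::getSOImmVal(0xFF000000u) != -1;
  // Thumb-2 modified immediate: the 0x00XY00XY pattern encodes.
  bool T2Encodable = llvm::ARM_AM::getT2SOImmVal(0x00FF00FFu) != -1;
  // Thumb1: an 8-bit immediate shifted left (0xFF << 6).
  bool ThumbShifted = llvm::ARM_AM::isThumbImmShiftedVal(0x3FC0u);
  (void)ARMEncodable; (void)T2Encodable; (void)ThumbShifted;
}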
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
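A minimal sketch (V is a hypothetical llvm::Value*) showing how the matchers above compose with match():
#include "llvm/IR/PatternMatch.h"
static void patternMatchExample(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *X = nullptr;
  llvm::ConstantInt *C = nullptr;
  // Matches "add X, C" with the operands in either order and binds X and C.
  if (match(V, m_c_Add(m_Value(X), m_ConstantInt(C)))) {
    // ... use X and C ...
  }
  // Matches "fneg X", including the "fsub -0.0, X" form.
  if (match(V, m_FNeg(m_Value(X)))) {
    // ... use X ...
  }
}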
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
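A minimal sketch of the cost-table lookup pattern this helper supports (the entries and costs are illustrative, not values used by this file):
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
static unsigned costTableExample() {
  static const llvm::CostTblEntry ExampleTbl[] = {
      {llvm::ISD::SHL, llvm::MVT::v4i32, 1},
      {llvm::ISD::ADD, llvm::MVT::v8i16, 1},
  };
  if (const auto *Entry =
          llvm::CostTableLookup(ExampleTbl, llvm::ISD::SHL, llvm::MVT::v4i32))
    return Entry->Cost; // found a matching (opcode, type) row
  return 0;             // no table entry: fall back to some default cost
}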
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition Local.h:252
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
RecurKind
These are the kinds of recurrences that we support.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
Attributes of a target dependent hardware loop.
LLVM_ABI bool canAnalyze(LoopInfo &LI)
LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
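An illustrative sketch of constructing the MemOp descriptors above (the sizes and alignments are made up):
#include "llvm/CodeGen/TargetLowering.h"
static void memOpExamples() {
  // A 64-byte memcpy with 4-byte aligned source and destination.
  llvm::MemOp CopyOp = llvm::MemOp::Copy(/*Size=*/64, /*DstAlignCanChange=*/false,
                                         llvm::Align(4), llvm::Align(4),
                                         /*IsVolatile=*/false);
  // A 32-byte zero memset with an 8-byte aligned destination.
  llvm::MemOp SetOp = llvm::MemOp::Set(/*Size=*/32, /*DstAlignCanChange=*/false,
                                       llvm::Align(8), /*IsZeroMemset=*/true,
                                       /*IsVolatile=*/false);
  (void)CopyOp; (void)SetOp;
}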
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
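As a hedged sketch (the helper name and the specific values are hypothetical, not this target's settings), a target typically fills these unrolling fields along these lines:
#include "llvm/Analysis/TargetTransformInfo.h"
static void fillUnrollingPreferences(
    llvm::TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // allow partial unrolling
  UP.Runtime = true;                // allow runtime unrolling
  UP.UpperBound = true;             // may unroll using the trip-count upper bound
  UP.UnrollRemainder = true;        // allow unrolling the runtime remainder loop
  UP.DefaultUnrollRuntimeCount = 4; // default count for runtime trip counts
}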