ARMTargetTransformInfo.cpp
1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
63extern cl::opt<TailPredication::Mode> EnableTailPredication;
64
65extern cl::opt<bool> EnableMaskedGatherScatters;
66
67extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
68
70 "arm-force-unroll-threshold", cl::init(12), cl::Hidden,
72 "Threshold for forced unrolling of small loops in Arm architecture"));
73
74/// Convert a vector load intrinsic into a simple llvm load instruction.
75/// This is beneficial when the underlying object being addressed comes
76/// from a constant, since we get constant-folding for free.
77static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
78 InstCombiner::BuilderTy &Builder) {
79 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
80
81 if (!IntrAlign)
82 return nullptr;
83
84 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
85 ? MemAlign
86 : IntrAlign->getLimitedValue();
87
88 if (!isPowerOf2_32(Alignment))
89 return nullptr;
90
91 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
92 Align(Alignment));
93}
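// For illustration: a vld1 intrinsic call returning <4 x i32> whose alignment
// argument is 1, but whose pointer operand %p is known to be 8-byte aligned,
// becomes a plain "load <4 x i32>, ptr %p, align 8". If %p points at a
// constant global, later combines can then constant-fold the load. (Sketch
// only; the alignment used is the larger of the intrinsic's alignment
// argument and the known memory alignment.)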
94
95bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
96 const Function *Callee) const {
97 const TargetMachine &TM = getTLI()->getTargetMachine();
98 const FeatureBitset &CallerBits =
99 TM.getSubtargetImpl(*Caller)->getFeatureBits();
100 const FeatureBitset &CalleeBits =
101 TM.getSubtargetImpl(*Callee)->getFeatureBits();
102
103 // To inline a callee, all features not in the allowed list must match exactly.
104 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
105 (CalleeBits & ~InlineFeaturesAllowed);
106 // For features in the allowed list, the callee's features must be a subset of
107 // the callers'.
108 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
109 (CalleeBits & InlineFeaturesAllowed);
110 return MatchExact && MatchSubset;
111}
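// In other words: inlining is rejected if caller and callee disagree on any
// feature outside InlineFeaturesAllowed, while for features inside that list
// the callee may use a subset of what the caller provides. For example, a
// callee built with fewer optional extensions than its caller can be inlined,
// but a callee that requires an extension the caller lacks cannot.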
112
113TTI::AddressingModeKind
114ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
115 ScalarEvolution *SE) const {
116 if (ST->hasMVEIntegerOps())
117 return TTI::AMK_PostIndexed;
118
119 if (L->getHeader()->getParent()->hasOptSize())
120 return TTI::AMK_None;
121
122 if (ST->isMClass() && ST->isThumb2() &&
123 L->getNumBlocks() == 1)
124 return TTI::AMK_PreIndexed;
125
126 return TTI::AMK_None;
127}
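// As a rough illustration of the difference: a post-indexed access such as
//   ldr r0, [r1], #4    ; load, then r1 += 4
// folds the pointer increment into the memory operation (useful for MVE
// loops), while a pre-indexed access such as
//   ldr r0, [r1, #4]!   ; r1 += 4, then load
// updates the base before the access, which suits small single-block
// Thumb2/M-class loops.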
128
129std::optional<Instruction *>
130ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
131 using namespace PatternMatch;
132 Intrinsic::ID IID = II.getIntrinsicID();
133 switch (IID) {
134 default:
135 break;
136 case Intrinsic::arm_neon_vld1: {
137 Align MemAlign =
138 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
139 &IC.getAssumptionCache(), &IC.getDominatorTree());
140 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
141 return IC.replaceInstUsesWith(II, V);
142 }
143 break;
144 }
145
146 case Intrinsic::arm_neon_vld2:
147 case Intrinsic::arm_neon_vld3:
148 case Intrinsic::arm_neon_vld4:
149 case Intrinsic::arm_neon_vld2lane:
150 case Intrinsic::arm_neon_vld3lane:
151 case Intrinsic::arm_neon_vld4lane:
152 case Intrinsic::arm_neon_vst1:
153 case Intrinsic::arm_neon_vst2:
154 case Intrinsic::arm_neon_vst3:
155 case Intrinsic::arm_neon_vst4:
156 case Intrinsic::arm_neon_vst2lane:
157 case Intrinsic::arm_neon_vst3lane:
158 case Intrinsic::arm_neon_vst4lane: {
159 Align MemAlign =
160 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
161 &IC.getAssumptionCache(), &IC.getDominatorTree());
162 unsigned AlignArg = II.arg_size() - 1;
163 Value *AlignArgOp = II.getArgOperand(AlignArg);
164 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
165 if (Align && *Align < MemAlign) {
166 return IC.replaceOperand(
167 II, AlignArg,
168 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
169 false));
170 }
171 break;
172 }
173
174 case Intrinsic::arm_neon_vld1x2:
175 case Intrinsic::arm_neon_vld1x3:
176 case Intrinsic::arm_neon_vld1x4:
177 case Intrinsic::arm_neon_vst1x2:
178 case Intrinsic::arm_neon_vst1x3:
179 case Intrinsic::arm_neon_vst1x4: {
180 Align NewAlign =
181 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
182 &IC.getAssumptionCache(), &IC.getDominatorTree());
183 Align OldAlign = II.getParamAlign(0).valueOrOne();
184 if (NewAlign > OldAlign)
185 II.addParamAttr(0,
186 Attribute::getWithAlignment(II.getContext(), NewAlign));
187 break;
188 }
189
190 case Intrinsic::arm_mve_pred_i2v: {
191 Value *Arg = II.getArgOperand(0);
192 Value *ArgArg;
193 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
194 PatternMatch::m_Value(ArgArg))) &&
195 II.getType() == ArgArg->getType()) {
196 return IC.replaceInstUsesWith(II, ArgArg);
197 }
198 Constant *XorMask;
199 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
200 PatternMatch::m_Value(ArgArg)),
201 PatternMatch::m_Constant(XorMask))) &&
202 II.getType() == ArgArg->getType()) {
203 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
204 if (CI->getValue().trunc(16).isAllOnes()) {
205 auto TrueVector = IC.Builder.CreateVectorSplat(
206 cast<FixedVectorType>(II.getType())->getNumElements(),
207 IC.Builder.getTrue());
208 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
209 }
210 }
211 }
212 KnownBits ScalarKnown(32);
213 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
214 ScalarKnown)) {
215 return &II;
216 }
217 break;
218 }
219 case Intrinsic::arm_mve_pred_v2i: {
220 Value *Arg = II.getArgOperand(0);
221 Value *ArgArg;
222 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
223 PatternMatch::m_Value(ArgArg)))) {
224 return IC.replaceInstUsesWith(II, ArgArg);
225 }
226
227 if (II.getMetadata(LLVMContext::MD_range))
228 break;
229
230 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
231
232 if (auto CurrentRange = II.getRange()) {
233 Range = Range.intersectWith(*CurrentRange);
234 if (Range == CurrentRange)
235 break;
236 }
237
238 II.addRangeRetAttr(Range);
239 II.addRetAttr(Attribute::NoUndef);
240 return &II;
241 }
242 case Intrinsic::arm_mve_vadc:
243 case Intrinsic::arm_mve_vadc_predicated: {
244 unsigned CarryOp =
245 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
246 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
247 "Bad type for intrinsic!");
248
249 KnownBits CarryKnown(32);
250 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
251 CarryKnown)) {
252 return &II;
253 }
254 break;
255 }
256 case Intrinsic::arm_mve_vmldava: {
257 Instruction *I = cast<Instruction>(&II);
258 if (I->hasOneUse()) {
259 auto *User = cast<Instruction>(*I->user_begin());
260 Value *OpZ;
261 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
262 match(I->getOperand(3), m_Zero())) {
263 Value *OpX = I->getOperand(4);
264 Value *OpY = I->getOperand(5);
265 Type *OpTy = OpX->getType();
266
267 IC.Builder.SetInsertPoint(User);
268 Value *V =
269 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
270 {I->getOperand(0), I->getOperand(1),
271 I->getOperand(2), OpZ, OpX, OpY});
272
273 IC.replaceInstUsesWith(*User, V);
274 return IC.eraseInstFromFunction(*User);
275 }
276 }
277 return std::nullopt;
278 }
279 }
280 return std::nullopt;
281}
282
283std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
284 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
285 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
286 std::function<void(Instruction *, unsigned, APInt, APInt &)>
287 SimplifyAndSetOp) const {
288
289 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
290 // opcode specifying a Top/Bottom instruction, which can change between
291 // instructions.
292 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
293 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
294 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
295
296 // Only the odd or even lanes of operand 0 will be demanded, depending
297 // on whether this is a top or bottom instruction.
298 APInt DemandedElts =
299 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
300 : APInt::getHighBitsSet(2, 1));
301 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
302 // The other lanes will be defined from the inserted elements.
303 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
304 : APInt::getHighBitsSet(2, 1));
305 return std::nullopt;
306 };
307
308 switch (II.getIntrinsicID()) {
309 default:
310 break;
311 case Intrinsic::arm_mve_vcvt_narrow:
312 SimplifyNarrowInstrTopBottom(2);
313 break;
314 case Intrinsic::arm_mve_vqmovn:
315 SimplifyNarrowInstrTopBottom(4);
316 break;
317 case Intrinsic::arm_mve_vshrn:
318 SimplifyNarrowInstrTopBottom(7);
319 break;
320 }
321
322 return std::nullopt;
323}
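// Rough example of the lane analysis above: for a "bottom" narrowing such as
// vqmovnb on an 8 x i16 result, the narrowed values land in the even lanes,
// so only the odd lanes of the merged-into operand 0 survive and are
// demanded; for the corresponding "top" form it is the even lanes of
// operand 0 that remain live.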
324
325InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
326 TTI::TargetCostKind CostKind) const {
327 assert(Ty->isIntegerTy());
328
329 unsigned Bits = Ty->getPrimitiveSizeInBits();
330 if (Bits == 0 || Imm.getActiveBits() >= 64)
331 return 4;
332
333 int64_t SImmVal = Imm.getSExtValue();
334 uint64_t ZImmVal = Imm.getZExtValue();
335 if (!ST->isThumb()) {
336 if ((SImmVal >= 0 && SImmVal < 65536) ||
337 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
338 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
339 return 1;
340 return ST->hasV6T2Ops() ? 2 : 3;
341 }
342 if (ST->isThumb2()) {
343 if ((SImmVal >= 0 && SImmVal < 65536) ||
344 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
345 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
346 return 1;
347 return ST->hasV6T2Ops() ? 2 : 3;
348 }
349 // Thumb1, any i8 imm cost 1.
350 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
351 return 1;
352 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
353 return 2;
354 // Load from constantpool.
355 return 3;
356}
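// A few illustrative data points for the logic above (costs approximate): on
// ARM/Thumb2 a constant such as 0x0000ff00, which fits a modified-immediate
// encoding, costs 1; a full 32-bit constant such as 0x12345678 costs 2 via
// movw/movt on v6t2 and later; and on Thumb1 a constant that fits neither an
// i8 immediate nor a shifted i8 falls back to a constant-pool load (cost 3).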
357
358// Constants smaller than 256 fit in the immediate field of
359// Thumb1 instructions so we return a zero cost and 1 otherwise.
360InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
361 const APInt &Imm,
362 Type *Ty) const {
363 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
364 return 0;
365
366 return 1;
367}
368
369// Checks whether Inst is part of a min(max()) or max(min()) pattern
370// that will match to an SSAT instruction. Returns the instruction being
371// saturated, or null if no saturation pattern was found.
372static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
373 Value *LHS, *RHS;
374 ConstantInt *C;
375 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
376
377 if (InstSPF == SPF_SMAX &&
378 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
379 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
380
381 auto isSSatMin = [&](Value *MinInst) {
382 if (isa<SelectInst>(MinInst)) {
383 Value *MinLHS, *MinRHS;
384 ConstantInt *MinC;
385 SelectPatternFlavor MinSPF =
386 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
387 if (MinSPF == SPF_SMIN &&
388 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
389 MinC->getValue() == ((-Imm) - 1))
390 return true;
391 }
392 return false;
393 };
394
395 if (isSSatMin(Inst->getOperand(1)))
396 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
397 if (Inst->hasNUses(2) &&
398 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
399 return Inst->getOperand(1);
400 }
401 return nullptr;
402}
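// For example, a clamp of the form min(max(x, -128), 127) (in either nesting
// order) is the saturation performed by "ssat r0, #8, r0": Imm is -128 (a
// negated power of two) and the min constant is (-Imm)-1 == 127. The value
// being clamped is returned so the caller can treat the constants as free.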
403
404// Look for a FP Saturation pattern, where the instruction can be simplified to
405// a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
406static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
407 if (Imm.getBitWidth() != 64 ||
408 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
409 return false;
410 Value *FP = isSSATMinMaxPattern(Inst, Imm);
411 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
412 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
413 if (!FP)
414 return false;
415 return isa<FPToSIInst>(FP);
416}
417
418InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
419 const APInt &Imm, Type *Ty,
420 TTI::TargetCostKind CostKind,
421 Instruction *Inst) const {
422 // Division by a constant can be turned into multiplication, but only if we
423 // know it's constant. So it's not so much that the immediate is cheap (it's
424 // not), but that the alternative is worse.
425 // FIXME: this is probably unneeded with GlobalISel.
426 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
427 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
428 Idx == 1)
429 return 0;
430
431 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
432 // splitting any large offsets.
433 if (Opcode == Instruction::GetElementPtr && Idx != 0)
434 return 0;
435
436 if (Opcode == Instruction::And) {
437 // UXTB/UXTH
438 if (Imm == 255 || Imm == 65535)
439 return 0;
440 // Conversion to BIC is free, and means we can use ~Imm instead.
441 return std::min(getIntImmCost(Imm, Ty, CostKind),
442 getIntImmCost(~Imm, Ty, CostKind));
443 }
444
445 if (Opcode == Instruction::Add)
446 // Conversion to SUB is free, and means we can use -Imm instead.
447 return std::min(getIntImmCost(Imm, Ty, CostKind),
448 getIntImmCost(-Imm, Ty, CostKind));
449
450 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
451 Ty->getIntegerBitWidth() == 32) {
452 int64_t NegImm = -Imm.getSExtValue();
453 if (ST->isThumb2() && NegImm < 1<<12)
454 // icmp X, #-C -> cmn X, #C
455 return 0;
456 if (ST->isThumb() && NegImm < 1<<8)
457 // icmp X, #-C -> adds X, #C
458 return 0;
459 }
460
461 // xor a, -1 can always be folded to MVN
462 if (Opcode == Instruction::Xor && Imm.isAllOnes())
463 return 0;
464
465 // Ensures negative constant of min(max()) or max(min()) patterns that
466 // match to SSAT instructions don't get hoisted
467 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
468 Ty->getIntegerBitWidth() <= 32) {
469 if (isSSATMinMaxPattern(Inst, Imm) ||
470 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
471 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
472 return 0;
473 }
474
475 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
476 return 0;
477
478 // We can convert <= -1 to < 0, which is generally quite cheap.
479 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
480 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
481 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
482 return std::min(getIntImmCost(Imm, Ty, CostKind),
483 getIntImmCost(Imm + 1, Ty, CostKind));
484 }
485
486 return getIntImmCost(Imm, Ty, CostKind);
487}
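// Example of the And/Add special cases above: "and r0, r0, #0xffffff00" can
// be emitted as "bic r0, r0, #0xff", so the cost of ~Imm is considered as
// well; similarly "add r0, r0, #-8" can be emitted as "sub r0, r0, #8", so
// the cost of -Imm is considered. The cheaper encoding wins.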
488
489InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
490 TTI::TargetCostKind CostKind,
491 const Instruction *I) const {
492 if (CostKind == TTI::TCK_RecipThroughput &&
493 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
494 // FIXME: The vectorizer is highly sensitive to the cost of these
495 // instructions, which suggests that it may be using the costs incorrectly.
496 // But, for now, just make them free to avoid performance regressions for
497 // vector targets.
498 return 0;
499 }
500 return BaseT::getCFInstrCost(Opcode, CostKind, I);
501}
502
503InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
504 Type *Src,
505 TTI::CastContextHint CCH,
506 TTI::TargetCostKind CostKind,
507 const Instruction *I) const {
508 int ISD = TLI->InstructionOpcodeToISD(Opcode);
509 assert(ISD && "Invalid opcode");
510
511 // TODO: Allow non-throughput costs that aren't binary.
512 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
513 if (CostKind != TTI::TCK_RecipThroughput)
514 return Cost == 0 ? 0 : 1;
515 return Cost;
516 };
517 auto IsLegalFPType = [this](EVT VT) {
518 EVT EltVT = VT.getScalarType();
519 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
520 (EltVT == MVT::f64 && ST->hasFP64()) ||
521 (EltVT == MVT::f16 && ST->hasFullFP16());
522 };
523
524 EVT SrcTy = TLI->getValueType(DL, Src);
525 EVT DstTy = TLI->getValueType(DL, Dst);
526
527 if (!SrcTy.isSimple() || !DstTy.isSimple())
528 return AdjustCost(
529 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
530
531 // Extending masked load/Truncating masked stores is expensive because we
532 // currently don't split them. This means that we'll likely end up
533 // loading/storing each element individually (hence the high cost).
534 if ((ST->hasMVEIntegerOps() &&
535 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
536 Opcode == Instruction::SExt)) ||
537 (ST->hasMVEFloatOps() &&
538 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
539 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
540 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
541 return 2 * DstTy.getVectorNumElements() *
542 ST->getMVEVectorCostFactor(CostKind);
543
544 // The extend of other kinds of load is free
545 if (CCH == TTI::CastContextHint::Normal ||
546 CCH == TTI::CastContextHint::Masked) {
547 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
548 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
549 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
550 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
551 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
552 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
553 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
554 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
555 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
556 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
557 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
558 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
559 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
560 };
561 if (const auto *Entry = ConvertCostTableLookup(
562 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
563 return AdjustCost(Entry->Cost);
564
565 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
566 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
567 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
568 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
569 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
570 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
571 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
572 // The following entries extend from a legal type to an illegal type, so
573 // the load needs to be split. This introduces an extra load operation,
574 // but the extend is still "free".
575 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
576 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
577 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
578 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
579 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
580 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
581 };
582 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
583 if (const auto *Entry =
584 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
585 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
586 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
587 }
588
589 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
590 // FPExtends are similar but also require the VCVT instructions.
591 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
592 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
593 };
594 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
595 if (const auto *Entry =
596 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
597 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
598 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
599 }
600
601 // The truncate of a store is free. This is the mirror of extends above.
602 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
603 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
604 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
605 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
606 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
607 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
608 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
609 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
610 };
611 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
612 if (const auto *Entry =
613 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
614 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
615 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
616 }
617
618 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
619 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
620 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
621 };
622 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
623 if (const auto *Entry =
624 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
625 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
626 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
627 }
628 }
629
630 // NEON vector operations that can extend their inputs.
631 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
632 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
633 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
634 // vaddl
635 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
636 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
637 // vsubl
638 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
639 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
640 // vmull
641 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
642 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
643 // vshll
644 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
645 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
646 };
647
648 auto *User = cast<Instruction>(*I->user_begin());
649 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
650 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
651 DstTy.getSimpleVT(),
652 SrcTy.getSimpleVT())) {
653 return AdjustCost(Entry->Cost);
654 }
655 }
656
657 // Single to/from double precision conversions.
658 if (Src->isVectorTy() && ST->hasNEON() &&
659 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
660 DstTy.getScalarType() == MVT::f32) ||
661 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
662 DstTy.getScalarType() == MVT::f64))) {
663 static const CostTblEntry NEONFltDblTbl[] = {
664 // Vector fptrunc/fpext conversions.
665 {ISD::FP_ROUND, MVT::v2f64, 2},
666 {ISD::FP_EXTEND, MVT::v2f32, 2},
667 {ISD::FP_EXTEND, MVT::v4f32, 4}};
668
669 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
670 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
671 return AdjustCost(LT.first * Entry->Cost);
672 }
673
674 // Some arithmetic, load and store operations have specific instructions
675 // to cast up/down their types automatically at no extra cost.
676 // TODO: Get these tables to know at least what the related operations are.
677 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
678 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
679 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
680 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
681 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
682 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
683 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
684
685 // The number of vmovl instructions for the extension.
686 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
687 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
688 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
689 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
690 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
691 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
692 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
693 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
694 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
695 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
696 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
697 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
698 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
699 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
700 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
701 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
702 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
703 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
704
705 // Operations that we legalize using splitting.
706 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
707 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
708
709 // Vector float <-> i32 conversions.
710 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
711 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
712
713 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
714 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
715 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
716 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
717 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
718 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
719 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
720 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
721 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
722 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
723 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
724 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
725 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
726 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
727 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
728 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
729 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
730 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
731 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
732 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
733
734 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
735 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
736 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
737 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
738 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
739 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
740
741 // Vector double <-> i32 conversions.
742 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
743 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
744
745 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
746 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
747 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
748 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
749 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
750 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
751
752 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
753 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
754 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
755 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
756 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
757 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
758 };
759
760 if (SrcTy.isVector() && ST->hasNEON()) {
761 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
762 DstTy.getSimpleVT(),
763 SrcTy.getSimpleVT()))
764 return AdjustCost(Entry->Cost);
765 }
766
767 // Scalar float to integer conversions.
768 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
769 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
770 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
771 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
772 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
773 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
774 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
775 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
776 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
777 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
778 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
779 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
780 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
781 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
782 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
783 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
784 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
785 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
786 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
787 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
788 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
789 };
790 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
791 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
792 DstTy.getSimpleVT(),
793 SrcTy.getSimpleVT()))
794 return AdjustCost(Entry->Cost);
795 }
796
797 // Scalar integer to float conversions.
798 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
799 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
800 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
801 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
802 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
803 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
804 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
805 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
806 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
807 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
808 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
809 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
810 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
811 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
812 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
813 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
814 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
815 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
816 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
817 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
818 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
819 };
820
821 if (SrcTy.isInteger() && ST->hasNEON()) {
822 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
823 ISD, DstTy.getSimpleVT(),
824 SrcTy.getSimpleVT()))
825 return AdjustCost(Entry->Cost);
826 }
827
828 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
829 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
830 // are linearised so take more.
831 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
832 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
833 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
834 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
835 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
836 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
837 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
838 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
839 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
840 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
841 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
842 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
843 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
844 };
845
846 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
847 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
848 ISD, DstTy.getSimpleVT(),
849 SrcTy.getSimpleVT()))
850 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
851 }
852
853 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
854 // As a general rule, fp converts that were not matched above are scalarized
855 // and cost 1 vcvt for each lane, so long as the instruction is available.
856 // If not it will become a series of function calls.
857 const InstructionCost CallCost =
858 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
859 int Lanes = 1;
860 if (SrcTy.isFixedLengthVector())
861 Lanes = SrcTy.getVectorNumElements();
862
863 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
864 return Lanes;
865 else
866 return Lanes * CallCost;
867 }
868
869 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
870 SrcTy.isFixedLengthVector()) {
871 // Treat a truncate with larger than legal source (128bits for MVE) as
872 // expensive, 2 instructions per lane.
873 if ((SrcTy.getScalarType() == MVT::i8 ||
874 SrcTy.getScalarType() == MVT::i16 ||
875 SrcTy.getScalarType() == MVT::i32) &&
876 SrcTy.getSizeInBits() > 128 &&
877 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
878 return SrcTy.getVectorNumElements() * 2;
879 }
880
881 // Scalar integer conversion costs.
882 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
883 // i16 -> i64 requires two dependent operations.
884 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
885
886 // Truncates on i64 are assumed to be free.
887 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
888 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
889 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
890 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
891 };
892
893 if (SrcTy.isInteger()) {
894 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
895 DstTy.getSimpleVT(),
896 SrcTy.getSimpleVT()))
897 return AdjustCost(Entry->Cost);
898 }
899
900 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
901 ? ST->getMVEVectorCostFactor(CostKind)
902 : 1;
903 return AdjustCost(
904 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
905}
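// A concrete instance of the "free extend of a load" rules above: a
// sext(load i16) on ARM/Thumb2 becomes a single ldrsh, and with MVE a
// sext(load <8 x i8>) to <8 x i16> becomes a single vldrb.s16, so both are
// costed as 0 on top of the load itself. Exact codegen can of course vary
// with the surrounding code.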
906
907InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
908 TTI::TargetCostKind CostKind,
909 unsigned Index, const Value *Op0,
910 const Value *Op1) const {
911 // Penalize inserting into a D-subregister. We end up with a three times
912 // lower estimated throughput on Swift.
913 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
914 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
915 return 3;
916
917 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
918 Opcode == Instruction::ExtractElement)) {
919 // Cross-class copies are expensive on many microarchitectures,
920 // so assume they are expensive by default.
921 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
922 return 3;
923
924 // Even if it's not a cross class copy, this likely leads to mixing
925 // of NEON and VFP code and should be therefore penalized.
926 if (ValTy->isVectorTy() &&
927 ValTy->getScalarSizeInBits() <= 32)
928 return std::max<InstructionCost>(
929 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
930 2U);
931 }
932
933 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
934 Opcode == Instruction::ExtractElement)) {
935 // Integer cross-lane moves are more expensive than float, which can
936 // sometimes just be vmovs. Integers involve being passed to GPR registers,
937 // causing more of a delay.
938 std::pair<InstructionCost, MVT> LT =
939 getTypeLegalizationCost(ValTy->getScalarType());
940 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
941 }
942
943 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
944}
945
946InstructionCost ARMTTIImpl::getCmpSelInstrCost(
947 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
948 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
949 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
950 int ISD = TLI->InstructionOpcodeToISD(Opcode);
951
952 // Thumb scalar code size cost for select.
953 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
954 ST->isThumb() && !ValTy->isVectorTy()) {
955 // Assume expensive structs.
956 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
957 return TTI::TCC_Expensive;
958
959 // Select costs can vary because they:
960 // - may require one or more conditional mov (including an IT),
961 // - can't operate directly on immediates,
962 // - require live flags, which we can't copy around easily.
963 InstructionCost Cost = 0;
964
965 // Possible IT instruction for Thumb2, or more for Thumb1.
966 ++Cost;
967
968 // i1 values may need rematerialising by using mov immediates and/or
969 // flag setting instructions.
970 if (ValTy->isIntegerTy(1))
971 ++Cost;
972
973 return Cost;
974 }
975
976 // If this is a vector min/max/abs, use the cost of that intrinsic directly
977 // instead. Hopefully when min/max intrinsics are more prevalent this code
978 // will not be needed.
979 const Instruction *Sel = I;
980 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
981 Sel->hasOneUse())
982 Sel = cast<Instruction>(Sel->user_back());
983 if (Sel && ValTy->isVectorTy() &&
984 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
985 const Value *LHS, *RHS;
986 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
987 unsigned IID = 0;
988 switch (SPF) {
989 case SPF_ABS:
990 IID = Intrinsic::abs;
991 break;
992 case SPF_SMIN:
993 IID = Intrinsic::smin;
994 break;
995 case SPF_SMAX:
996 IID = Intrinsic::smax;
997 break;
998 case SPF_UMIN:
999 IID = Intrinsic::umin;
1000 break;
1001 case SPF_UMAX:
1002 IID = Intrinsic::umax;
1003 break;
1004 case SPF_FMINNUM:
1005 IID = Intrinsic::minnum;
1006 break;
1007 case SPF_FMAXNUM:
1008 IID = Intrinsic::maxnum;
1009 break;
1010 default:
1011 break;
1012 }
1013 if (IID) {
1014 // The ICmp is free, the select gets the cost of the min/max/etc
1015 if (Sel != I)
1016 return 0;
1017 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1018 return getIntrinsicInstrCost(CostAttrs, CostKind);
1019 }
1020 }
1021
1022 // On NEON a vector select gets lowered to vbsl.
1023 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1024 // Lowering of some vector selects is currently far from perfect.
1025 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1026 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1027 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1028 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1029 };
1030
1031 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1032 EVT SelValTy = TLI->getValueType(DL, ValTy);
1033 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1034 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1035 SelCondTy.getSimpleVT(),
1036 SelValTy.getSimpleVT()))
1037 return Entry->Cost;
1038 }
1039
1040 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1041 return LT.first;
1042 }
1043
1044 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1045 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1046 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1047 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1048 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1049 if (!VecCondTy)
1050 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1051
1052 // If we don't have mve.fp any fp operations will need to be scalarized.
1053 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1054 // One scalarization insert, one scalarization extract and the cost of the
1055 // fcmps.
1056 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1057 /*Extract*/ true, CostKind) +
1058 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1059 /*Extract*/ false, CostKind) +
1060 VecValTy->getNumElements() *
1061 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1062 VecCondTy->getScalarType(), VecPred,
1063 CostKind, Op1Info, Op2Info, I);
1064 }
1065
1066 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1067 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1068 // There are two types - the input that specifies the type of the compare
1069 // and the output vXi1 type. Because we don't know how the output will be
1070 // split, we may need an expensive shuffle to get two in sync. This has the
1071 // effect of making larger than legal compares (v8i32 for example)
1072 // expensive.
1073 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1074 if (LT.first > 1)
1075 return LT.first * BaseCost +
1076 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1077 /*Extract*/ false, CostKind);
1078 return BaseCost;
1079 }
1080 }
1081
1082 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1083 // for "multiple beats" potentially needed by MVE instructions.
1084 int BaseCost = 1;
1085 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1086 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1087
1088 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1089 CostKind, Op1Info, Op2Info, I);
1090}
1091
1092InstructionCost ARMTTIImpl::getAddressComputationCost(Type *PtrTy,
1093 ScalarEvolution *SE,
1094 const SCEV *Ptr,
1095 TTI::TargetCostKind CostKind) const {
1096 // Address computations in vectorized code with non-consecutive addresses will
1097 // likely result in more instructions compared to scalar code where the
1098 // computation can more often be merged into the index mode. The resulting
1099 // extra micro-ops can significantly decrease throughput.
1100 unsigned NumVectorInstToHideOverhead = 10;
1101 int MaxMergeDistance = 64;
1102
1103 if (ST->hasNEON()) {
1104 if (PtrTy->isVectorTy() && SE &&
1105 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1106 return NumVectorInstToHideOverhead;
1107
1108 // In many cases the address computation is not merged into the instruction
1109 // addressing mode.
1110 return 1;
1111 }
1112 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1113}
1114
1115bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
1116 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1117 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1118 // optimized, else LSR may block tail-predication.
1119 switch (II->getIntrinsicID()) {
1120 case Intrinsic::arm_mve_vctp8:
1121 case Intrinsic::arm_mve_vctp16:
1122 case Intrinsic::arm_mve_vctp32:
1123 case Intrinsic::arm_mve_vctp64:
1124 return true;
1125 default:
1126 break;
1127 }
1128 }
1129 return false;
1130}
1131
1132bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
1133 unsigned /*AddressSpace*/,
1134 TTI::MaskKind /*MaskKind*/) const {
1135 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1136 return false;
1137
1138 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1139 // Don't support v2i1 yet.
1140 if (VecTy->getNumElements() == 2)
1141 return false;
1142
1143 // We don't support extending fp types.
1144 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1145 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1146 return false;
1147 }
1148
1149 unsigned EltWidth = DataTy->getScalarSizeInBits();
1150 return (EltWidth == 32 && Alignment >= 4) ||
1151 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1152}
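// So, for example, a masked load of <4 x i32> with 4-byte alignment is legal
// (it maps onto a predicated VLDRW), while <8 x i16> with only byte
// alignment is not and would be scalarized instead.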
1153
1154bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1155 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1156 return false;
1157
1158 unsigned EltWidth = Ty->getScalarSizeInBits();
1159 return ((EltWidth == 32 && Alignment >= 4) ||
1160 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1161}
1162
1163/// Given a memcpy/memset/memmove instruction, return the number of memory
1164/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1165/// call is used.
1167 MemOp MOp;
1168 unsigned DstAddrSpace = ~0u;
1169 unsigned SrcAddrSpace = ~0u;
1170 const Function *F = I->getParent()->getParent();
1171
1172 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1173 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1174 // If 'size' is not a constant, a library call will be generated.
1175 if (!C)
1176 return -1;
1177
1178 const unsigned Size = C->getValue().getZExtValue();
1179 const Align DstAlign = MC->getDestAlign().valueOrOne();
1180 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1181
1182 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1183 /*IsVolatile*/ false);
1184 DstAddrSpace = MC->getDestAddressSpace();
1185 SrcAddrSpace = MC->getSourceAddressSpace();
1186 }
1187 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1188 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1189 // If 'size' is not a constant, a library call will be generated.
1190 if (!C)
1191 return -1;
1192
1193 const unsigned Size = C->getValue().getZExtValue();
1194 const Align DstAlign = MS->getDestAlign().valueOrOne();
1195
1196 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1197 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1198 DstAddrSpace = MS->getDestAddressSpace();
1199 }
1200 else
1201 llvm_unreachable("Expected a memcpy/move or memset!");
1202
1203 unsigned Limit, Factor = 2;
1204 switch(I->getIntrinsicID()) {
1205 case Intrinsic::memcpy:
1206 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1207 break;
1208 case Intrinsic::memmove:
1209 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1210 break;
1211 case Intrinsic::memset:
1212 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1213 Factor = 1;
1214 break;
1215 default:
1216 llvm_unreachable("Expected a memcpy/move or memset!");
1217 }
1218
1219 // MemOps will be populated with a list of data types that need to be
1220 // loaded and stored. That's why we multiply the number of elements by 2 to
1221 // get the cost for this memcpy.
1222 std::vector<EVT> MemOps;
1223 LLVMContext &C = F->getContext();
1224 if (getTLI()->findOptimalMemOpLowering(C, MemOps, Limit, MOp, DstAddrSpace,
1225 SrcAddrSpace, F->getAttributes()))
1226 return MemOps.size() * Factor;
1227
1228 // If we can't find an optimal memop lowering, return the default cost
1229 return -1;
1230}
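// As a rough example, a memcpy of 16 bytes with 4-byte aligned pointers may
// be lowered to four i32 load/store pairs; MemOps would then contain four
// entries and, with Factor == 2, this function would report 8 memory
// operations. The exact types chosen depend on alignment, the subtarget and
// the per-operation store limits queried above.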
1231
1232InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
1233 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1234
1235 // To model the cost of a library call, we assume 1 for the call, and
1236 // 3 for the argument setup.
1237 if (NumOps == -1)
1238 return 4;
1239 return NumOps;
1240}
1241
1242InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1243 VectorType *DstTy, VectorType *SrcTy,
1244 ArrayRef<int> Mask,
1245 TTI::TargetCostKind CostKind,
1246 int Index, VectorType *SubTp,
1247 ArrayRef<const Value *> Args,
1248 const Instruction *CxtI) const {
1249 assert((Mask.empty() || DstTy->isScalableTy() ||
1250 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1251 "Expected the Mask to match the return size if given");
1252 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1253 "Expected the same scalar types");
1254
1255 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1256 // Treat extractsubvector as single op permutation.
1257 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1258 if (IsExtractSubvector)
1259 Kind = TTI::SK_PermuteSingleSrc;
1260 if (ST->hasNEON()) {
1261 if (Kind == TTI::SK_Broadcast) {
1262 static const CostTblEntry NEONDupTbl[] = {
1263 // VDUP handles these cases.
1264 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1265 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1266 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1267 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1268 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1269 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1270
1271 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1272 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1273 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1274 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1275
1276 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1277 if (const auto *Entry =
1278 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1279 return LT.first * Entry->Cost;
1280 }
1281 if (Kind == TTI::SK_Reverse) {
1282 static const CostTblEntry NEONShuffleTbl[] = {
1283 // Reverse shuffle cost one instruction if we are shuffling within a
1284 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1285 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1286 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1287 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1288 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1289 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1290 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1291
1292 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1293 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1294 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1295 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1296
1297 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1298 if (const auto *Entry =
1299 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1300 return LT.first * Entry->Cost;
1301 }
1302 if (Kind == TTI::SK_Select) {
1303 static const CostTblEntry NEONSelShuffleTbl[] = {
1304 // Select shuffle cost table for ARM. Cost is the number of
1305 // instructions
1306 // required to create the shuffled vector.
1307
1308 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1309 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1310 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1311 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1312
1313 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1314 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1315 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1316
1317 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1318
1319 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1320
1321 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1322 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1323 ISD::VECTOR_SHUFFLE, LT.second))
1324 return LT.first * Entry->Cost;
1325 }
1326 }
1327 if (ST->hasMVEIntegerOps()) {
1328 if (Kind == TTI::SK_Broadcast) {
1329 static const CostTblEntry MVEDupTbl[] = {
1330 // VDUP handles these cases.
1331 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1332 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1333 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1334 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1335 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1336
1337 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1338 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1339 LT.second))
1340 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1341 }
1342
1343 if (!Mask.empty()) {
1344 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1345 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1346 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1347 // free, but we model it with a cost of LT.first so that LD2/LD4 have a
1348 // higher cost than just the load.
1349 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
1350 (LT.second.getScalarSizeInBits() == 8 ||
1351 LT.second.getScalarSizeInBits() == 16 ||
1352 LT.second.getScalarSizeInBits() == 32) &&
1353 LT.second.getSizeInBits() == 128 &&
1354 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1355 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2)) ||
1356 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1357 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))))
1358 return ST->getMVEVectorCostFactor(CostKind) *
1359 std::max<InstructionCost>(1, LT.first / 4);
1360
1361 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1362 // store(interleaving-shuffle). The shuffle cost could potentially be
1363 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1364 // higher cost than just the store.
1365 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
1366 (LT.second.getScalarSizeInBits() == 8 ||
1367 LT.second.getScalarSizeInBits() == 16 ||
1368 LT.second.getScalarSizeInBits() == 32) &&
1369 LT.second.getSizeInBits() == 128 &&
1370 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1371 ShuffleVectorInst::isInterleaveMask(
1372 Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1373 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1374 ShuffleVectorInst::isInterleaveMask(
1375 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
1376 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1377
1378 if (LT.second.isVector() &&
1379 Mask.size() <= LT.second.getVectorNumElements() &&
1380 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1381 isVREVMask(Mask, LT.second, 64)))
1382 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1383 }
1384 }
1385
1386 // Restore optimal kind.
1387 if (IsExtractSubvector)
1388 Kind = TTI::SK_ExtractSubvector;
1389 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1390 ? ST->getMVEVectorCostFactor(CostKind)
1391 : 1;
1392 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1393 Index, SubTp);
1394}
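// For instance, a broadcast shuffle of a <4 x i32> vector is a single VDUP
// on both NEON and MVE (the MVE case additionally scaled by the MVE cost
// factor), and a shufflevector mask like <1, 0, 3, 2> on <4 x i32> is
// recognised above as a VREV-style mask and costed as one operation per
// legalised vector.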
1395
1396InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1397 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1398 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1399 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1400 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1401 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1402 // Make operations on i1 relatively expensive as this often involves
1403 // combining predicates. AND and XOR should be easier to handle with IT
1404 // blocks.
1405 switch (ISDOpcode) {
1406 default:
1407 break;
1408 case ISD::AND:
1409 case ISD::XOR:
1410 return 2;
1411 case ISD::OR:
1412 return 3;
1413 }
1414 }
1415
1416 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1417
1418 if (ST->hasNEON()) {
1419 const unsigned FunctionCallDivCost = 20;
1420 const unsigned ReciprocalDivCost = 10;
1421 static const CostTblEntry CostTbl[] = {
1422 // Division.
1423 // These costs are somewhat random. Choose a cost of 20 to indicate that
1424 // vectorizing division (added function call) is going to be very expensive.
1425 // Double registers types.
1426 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1427 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1428 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1429 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1430 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1431 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1432 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1433 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1434 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1435 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1436 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1437 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1438 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1439 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1440 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1441 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1442 // Quad register types.
1443 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1444 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1445 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1446 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1447 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1448 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1449 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1450 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1451 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1452 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1453 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1454 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1455 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1456 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1457 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1458 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1459 // Multiplication.
1460 };
1461
1462 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1463 return LT.first * Entry->Cost;
1464
1465 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1466 Opcode, Ty, CostKind, Op1Info, Op2Info);
1467
1468 // This is somewhat of a hack. The problem that we are facing is that SROA
1469 // creates a sequence of shift, and, or instructions to construct values.
1470 // These sequences are recognized by the ISel and have zero-cost. Not so for
1471 // the vectorized code. Because we have support for v2i64 but not i64 those
1472 // sequences look particularly beneficial to vectorize.
1473 // To work around this we increase the cost of v2i64 operations to make them
1474 // seem less beneficial.
1475 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1476 Cost += 4;
1477
1478 return Cost;
1479 }
1480
1481 // If this operation is a shift on arm/thumb2, it might well be folded into
1482 // the following instruction, hence having a cost of 0.
1483 auto LooksLikeAFreeShift = [&]() {
1484 if (ST->isThumb1Only() || Ty->isVectorTy())
1485 return false;
1486
1487 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1488 return false;
1489 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1490 return false;
1491
1492 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1493 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1494 case Instruction::Add:
1495 case Instruction::Sub:
1496 case Instruction::And:
1497 case Instruction::Xor:
1498 case Instruction::Or:
1499 case Instruction::ICmp:
1500 return true;
1501 default:
1502 return false;
1503 }
1504 };
1505 if (LooksLikeAFreeShift())
1506 return 0;
1507
1508 // When targets have both DSP and MVE we find that the compiler will
1509 // attempt to vectorize as well as use scalar (S/U)MLAL operations.
1510 // In cases where we have the pattern ext(mul(ext(i16), ext(i16)))
1511 // we find that codegen performs better when only using (S/U)MLAL
1512 // scalar ops instead of trying to mix vector ops with (S/U)MLAL ops.
1513 // We therefore check if a mul instruction is used in a (U/S)MLAL
1514 // pattern.
1515 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1516 Type *Ty) -> bool {
1517 if (!ST->hasDSP())
1518 return false;
1519
1520 if (!I)
1521 return false;
1522
1523 if (Opcode != Instruction::Mul)
1524 return false;
1525
1526 if (Ty->isVectorTy())
1527 return false;
1528
1529 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1530 return cast<Instruction>(LHS)->getOpcode() ==
1531 cast<Instruction>(RHS)->getOpcode();
1532 };
1533 auto IsExtInst = [](const Value *V) -> bool {
1534 return isa<ZExtInst>(V) || isa<SExtInst>(V);
1535 };
1536 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1537 return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1538 };
1539
1540 // We check the arguments of the instruction to see if they're extends
1541 auto *BinOp = dyn_cast<BinaryOperator>(I);
1542 if (!BinOp)
1543 return false;
1544 Value *Op0 = BinOp->getOperand(0);
1545 Value *Op1 = BinOp->getOperand(1);
1546 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1547 // We're interested in an ext of an i16
1548 if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1549 !IsExtensionFromHalf(Op1))
1550 return false;
1551 // We need to check if this result will be further extended to i64
1552 // and that all these uses are SExt
1553 for (auto *U : I->users())
1554 if (!IsExtInst(U))
1555 return false;
1556 return true;
1557 }
1558
1559 return false;
1560 };
1561
1562 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1563 return 0;
1564
1565 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1566 // for "multiple beats" potentially needed by MVE instructions.
1567 int BaseCost = 1;
1568 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1569 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1570
1571 // The rest of this mostly follows what is done in
1572 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1573 // than scalars or increasing the costs for custom operations. The result is
1574 // also multiplied by the MVEVectorCostFactor where appropriate.
1575 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1576 return LT.first * BaseCost;
1577
1578 // Else this is expand, assume that we need to scalarize this op.
1579 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1580 unsigned Num = VTy->getNumElements();
1581 InstructionCost Cost =
1582 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1583 // Return the cost of multiple scalar invocation plus the cost of
1584 // inserting and extracting the values.
1585 SmallVector<Type *> Tys(Args.size(), Ty);
1586 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1587 Num * Cost;
1588 }
1589
1590 return BaseCost;
1591}
1592
1593InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1594 Align Alignment,
1595 unsigned AddressSpace,
1596 TTI::TargetCostKind CostKind,
1597 TTI::OperandValueInfo OpInfo,
1598 const Instruction *I) const {
1599 // TODO: Handle other cost kinds.
1600 if (CostKind != TTI::TCK_RecipThroughput)
1601 return 1;
1602
1603 // Type legalization can't handle structs
1604 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1605 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1606 CostKind);
1607
1608 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1609 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1610 // Unaligned loads/stores are extremely inefficient.
1611 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1612 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1613 return LT.first * 4;
1614 }
1615
1616 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1617 // Same for stores.
1618 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1619 ((Opcode == Instruction::Load && I->hasOneUse() &&
1620 isa<FPExtInst>(*I->user_begin())) ||
1621 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1622 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1623 Type *DstTy =
1624 Opcode == Instruction::Load
1625 ? (*I->user_begin())->getType()
1626 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1627 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1628 DstTy->getScalarType()->isFloatTy())
1629 return ST->getMVEVectorCostFactor(CostKind);
1630 }
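  // Illustrative example (value names invented) of the pattern costed just
  // above:
  //   %v = load <4 x half>, ptr %p, align 2
  //   %e = fpext <4 x half> %v to <4 x float>
  // MVE can treat this as a single extending load (and the mirrored
  // fptrunc-then-store as a single narrowing store), hence the single
  // MVE-factor cost.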
1631
1632 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1633 ? ST->getMVEVectorCostFactor(CostKind)
1634 : 1;
1635 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1636 CostKind, OpInfo, I);
1637}
1638
1639InstructionCost
1640ARMTTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
1641 TTI::TargetCostKind CostKind) const {
1642 switch (MICA.getID()) {
1643 case Intrinsic::masked_scatter:
1644 case Intrinsic::masked_gather:
1645 return getGatherScatterOpCost(MICA, CostKind);
1646 case Intrinsic::masked_load:
1647 case Intrinsic::masked_store:
1648 return getMaskedMemoryOpCost(MICA, CostKind);
1649 }
1650 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1651}
1652
1653InstructionCost
1654ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1655 TTI::TargetCostKind CostKind) const {
1656 unsigned IID = MICA.getID();
1657 Type *Src = MICA.getDataType();
1658 Align Alignment = MICA.getAlignment();
1659 unsigned AddressSpace = MICA.getAddressSpace();
1660 if (ST->hasMVEIntegerOps()) {
1661 if (IID == Intrinsic::masked_load &&
1662 isLegalMaskedLoad(Src, Alignment, AddressSpace))
1663 return ST->getMVEVectorCostFactor(CostKind);
1664 if (IID == Intrinsic::masked_store &&
1665 isLegalMaskedStore(Src, Alignment, AddressSpace))
1666 return ST->getMVEVectorCostFactor(CostKind);
1667 }
1668 if (!isa<FixedVectorType>(Src))
1669 return BaseT::getMaskedMemoryOpCost(MICA, CostKind);
1670 // Scalar cost, which is currently very high due to the inefficiency of
1671 // the generated code.
1672 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1673}
1674
1675InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1676 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1677 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1678 bool UseMaskForCond, bool UseMaskForGaps) const {
1679 assert(Factor >= 2 && "Invalid interleave factor");
1680 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1681
1682 // vldN/vstN doesn't support vector types of i64/f64 element.
1683 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1684
1685 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1686 !UseMaskForCond && !UseMaskForGaps) {
1687 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1688 auto *SubVecTy =
1689 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1690
1691 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1692 // Accesses having vector types that are a multiple of 128 bits can be
1693 // matched to more than one vldN/vstN instruction.
1694 int BaseCost =
1695 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1696 if (NumElts % Factor == 0 &&
1697 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1698 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1699
1700 // Some smaller than legal interleaved patterns are cheap as we can make
1701 // use of the vmovn or vrev patterns to interleave a standard load. This is
1702 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1703 // promoted differently). The cost of 2 here is then a load and vrev or
1704 // vmovn.
1705 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1706 VecTy->isIntOrIntVectorTy() &&
1707 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1708 return 2 * BaseCost;
1709 }
1710
1711 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1712 Alignment, AddressSpace, CostKind,
1713 UseMaskForCond, UseMaskForGaps);
1714}
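// Illustrative example (types invented): with MVE, a factor-2 interleaved
// load of <8 x i16> (two <4 x i16> halves, 64 bits each) falls into the
// "smaller than legal" case above and is costed as roughly a load plus a
// vrev/vmovn-style rearrangement (2 * BaseCost) rather than being scalarized.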
1715
1716InstructionCost
1717ARMTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
1718 TTI::TargetCostKind CostKind) const {
1719
1720 Type *DataTy = MICA.getDataType();
1721 const Value *Ptr = MICA.getPointer();
1722 bool VariableMask = MICA.getVariableMask();
1723 Align Alignment = MICA.getAlignment();
1724 const Instruction *I = MICA.getInst();
1725
1726 using namespace PatternMatch;
1727 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1728 return BaseT::getGatherScatterOpCost(MICA, CostKind);
1729
1730 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1731 auto *VTy = cast<FixedVectorType>(DataTy);
1732
1733 // TODO: Splitting, once we do that.
1734
1735 unsigned NumElems = VTy->getNumElements();
1736 unsigned EltSize = VTy->getScalarSizeInBits();
1737 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1738
1739 // For now, it is assumed that for the MVE gather instructions the loads are
1740 // all effectively serialised. This means the cost is the scalar cost
1741 // multiplied by the number of elements being loaded. This is possibly very
1742 // conservative, but even so we still end up vectorising loops because the
1743 // cost per iteration for many loops is lower than for scalar loops.
1744 InstructionCost VectorCost =
1745 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1746 // The scalarization cost should be a lot higher. We use the number of vector
1747 // elements plus the scalarization overhead. If masking is required then a lot
1748 // of little blocks will be needed and potentially a scalarized p0 mask,
1749 // greatly increasing the cost.
1750 InstructionCost ScalarCost =
1751 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1752 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1753 CostKind) +
1754 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1755 CostKind);
1756
1757 if (EltSize < 8 || Alignment < EltSize / 8)
1758 return ScalarCost;
1759
1760 unsigned ExtSize = EltSize;
1761 // Check whether there's a single user that asks for an extended type
1762 if (I != nullptr) {
1763 // Depending on the caller of this function, a gather instruction will
1764 // either have opcode Instruction::Load or be a call to the masked_gather
1765 // intrinsic.
1766 if ((I->getOpcode() == Instruction::Load ||
1767 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1768 I->hasOneUse()) {
1769 const User *Us = *I->users().begin();
1770 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1771 // only allow valid type combinations
1772 unsigned TypeSize =
1773 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1774 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1775 (TypeSize == 16 && EltSize == 8)) &&
1776 TypeSize * NumElems == 128) {
1777 ExtSize = TypeSize;
1778 }
1779 }
1780 }
1781 // Check whether the input data needs to be truncated
1782 TruncInst *T;
1783 if ((I->getOpcode() == Instruction::Store ||
1784 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1785 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1786 // Only allow valid type combinations
1787 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1788 if (((EltSize == 16 && TypeSize == 32) ||
1789 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1790 TypeSize * NumElems == 128)
1791 ExtSize = TypeSize;
1792 }
1793 }
1794
1795 if (ExtSize * NumElems != 128 || NumElems < 4)
1796 return ScalarCost;
1797
1798 // Any (aligned) i32 gather will not need to be scalarised.
1799 if (ExtSize == 32)
1800 return VectorCost;
1801 // For smaller types, we need to ensure that the gep's inputs are correctly
1802 // extended from a small enough value. Other sizes (including i64) are
1803 // scalarized for now.
1804 if (ExtSize != 8 && ExtSize != 16)
1805 return ScalarCost;
1806
1807 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1808 Ptr = BC->getOperand(0);
1809 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1810 if (GEP->getNumOperands() != 2)
1811 return ScalarCost;
1812 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1813 // Scale needs to be correct (which is only relevant for i16s).
1814 if (Scale != 1 && Scale * 8 != ExtSize)
1815 return ScalarCost;
1816 // And we need to zext (not sext) the indexes from a small enough type.
1817 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1818 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1819 return VectorCost;
1820 }
1821 return ScalarCost;
1822 }
1823 return ScalarCost;
1824}
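// Illustrative example (value names invented): a gather that keeps the
// cheaper VectorCost above is an aligned <4 x i32> masked gather, e.g.
//   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs,
//                        i32 4, <4 x i1> %mask, <4 x i32> poison)
// (ExtSize == 32 and ExtSize * NumElems == 128). Narrower element types
// additionally require the GEP index to be zero-extended from a small enough
// type, as checked at the end of the function.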
1825
1826InstructionCost
1827ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1828 std::optional<FastMathFlags> FMF,
1829 TTI::TargetCostKind CostKind) const {
1830
1831 EVT ValVT = TLI->getValueType(DL, ValTy);
1832 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1833 unsigned EltSize = ValVT.getScalarSizeInBits();
1834
1835 // In general floating point reductions are a series of elementwise
1836 // operations, with free extracts on each step. These are either in-order or
1837 // treewise depending on whether that is allowed by the fast math flags.
1838 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1839 ((EltSize == 32 && ST->hasVFP2Base()) ||
1840 (EltSize == 64 && ST->hasFP64()) ||
1841 (EltSize == 16 && ST->hasFullFP16()))) {
1842 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1843 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1844 InstructionCost VecCost = 0;
1845 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1846 NumElts * EltSize > VecLimit) {
1847 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1848 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1849 NumElts /= 2;
1850 }
1851
1852 // For fp16 we need to extract the upper lane elements. MVE can add a
1853 // VREV+FMIN/MAX to perform another vector step instead.
1854 InstructionCost ExtractCost = 0;
1855 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1856 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1857 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1858 NumElts /= 2;
1859 } else if (ValVT.getVectorElementType() == MVT::f16)
1860 ExtractCost = NumElts / 2;
1861
1862 return VecCost + ExtractCost +
1863 NumElts *
1864 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1865 }
1866
1867 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1868 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1869 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1870 unsigned VecLimit =
1871 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1872 InstructionCost VecCost = 0;
1873 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1874 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1875 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1876 NumElts /= 2;
1877 }
1878 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1879 // step.
1880 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1881 NumElts * EltSize == 64) {
1882 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1883 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1884 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1885 NumElts /= 2;
1886 }
1887
1888 // From here we extract the elements and perform the and/or/xor.
1889 InstructionCost ExtractCost = NumElts;
1890 return VecCost + ExtractCost +
1891 (NumElts - 1) * getArithmeticInstrCost(
1892 Opcode, ValTy->getElementType(), CostKind);
1893 }
1894
1895 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1896 TTI::requiresOrderedReduction(FMF))
1897 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1898
1899 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1900
1901 static const CostTblEntry CostTblAdd[]{
1902 {ISD::ADD, MVT::v16i8, 1},
1903 {ISD::ADD, MVT::v8i16, 1},
1904 {ISD::ADD, MVT::v4i32, 1},
1905 };
1906 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1907 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1908
1909 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1910}
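// Illustrative example (value names invented): with MVE, an integer add
// reduction such as
//   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
// hits the v8i16 table entry above and is costed as a single MVE-factor
// operation, matching a VADDV-style across-lane add.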
1911
1912InstructionCost ARMTTIImpl::getExtendedReductionCost(
1913 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1914 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1915 EVT ValVT = TLI->getValueType(DL, ValTy);
1916 EVT ResVT = TLI->getValueType(DL, ResTy);
1917
1918 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1919
1920 switch (ISD) {
1921 case ISD::ADD:
1922 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1923 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1924
1925 // The legal cases are:
1926 // VADDV u/s 8/16/32
1927 // VADDLV u/s 32
1928 // Codegen currently cannot always handle larger than legal vectors very
1929 // well, especially for predicated reductions where the mask needs to be
1930 // split, so restrict to 128bit or smaller input types.
1931 unsigned RevVTSize = ResVT.getSizeInBits();
1932 if (ValVT.getSizeInBits() <= 128 &&
1933 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1934 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1935 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1936 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1937 }
1938 break;
1939 default:
1940 break;
1941 }
1942 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1943 CostKind);
1944}
1945
1946InstructionCost
1947ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
1948 Type *ResTy, VectorType *ValTy,
1949 TTI::TargetCostKind CostKind) const {
1950 if (RedOpcode != Instruction::Add)
1951 return InstructionCost::getInvalid();
1952 EVT ValVT = TLI->getValueType(DL, ValTy);
1953 EVT ResVT = TLI->getValueType(DL, ResTy);
1954
1955 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1956 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1957
1958 // The legal cases are:
1959 // VMLAV u/s 8/16/32
1960 // VMLALV u/s 16/32
1961 // Codegen currently cannot always handle larger than legal vectors very
1962 // well, especially for predicated reductions where the mask needs to be
1963 // split, so restrict to 128bit or smaller input types.
1964 unsigned RevVTSize = ResVT.getSizeInBits();
1965 if (ValVT.getSizeInBits() <= 128 &&
1966 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1967 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1968 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1969 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1970 }
1971
1972 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
1973 CostKind);
1974}
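// Illustrative example (value names invented) of the multiply-accumulate
// reduction costed above:
//   %xa = sext <8 x i16> %a to <8 x i32>
//   %xb = sext <8 x i16> %b to <8 x i32>
//   %m  = mul <8 x i32> %xa, %xb
//   %r  = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
// This corresponds to the legal VMLAV/VMLALV cases listed in the comment, so
// it gets a single MVE-factor cost rather than separate mul and reduction
// costs.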
1975
1976InstructionCost
1977ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1978 FastMathFlags FMF,
1979 TTI::TargetCostKind CostKind) const {
1980 EVT ValVT = TLI->getValueType(DL, Ty);
1981
1982 // In general floating point reductions are a series of elementwise
1983 // operations, with free extracts on each step. These are either in-order or
1984 // treewise depending on whether that is allowed by the fast math flags.
1985 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1986 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1987 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1988 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1989 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1990 unsigned EltSize = ValVT.getScalarSizeInBits();
1991 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1992 InstructionCost VecCost;
1993 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1994 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1995 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1996 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1997 NumElts /= 2;
1998 }
1999
2000 // For fp16 we need to extract the upper lane elements. MVE can add a
2001 // VREV+FMIN/MAX to perform another vector step instead.
2002 InstructionCost ExtractCost = 0;
2003 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
2004 NumElts == 8) {
2005 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
2006 NumElts /= 2;
2007 } else if (ValVT.getVectorElementType() == MVT::f16)
2008 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
2009
2010 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
2011 {Ty->getElementType(), Ty->getElementType()},
2012 FMF);
2013 return VecCost + ExtractCost +
2014 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
2015 }
2016
2017 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
2018 IID == Intrinsic::umin || IID == Intrinsic::umax) {
2019 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2020
2021 // All costs are the same for u/s min/max. These lower to vminv, which are
2022 // given a slightly higher cost as they tend to take multiple cycles for
2023 // smaller type sizes.
2024 static const CostTblEntry CostTblAdd[]{
2025 {ISD::SMIN, MVT::v16i8, 4},
2026 {ISD::SMIN, MVT::v8i16, 3},
2027 {ISD::SMIN, MVT::v4i32, 2},
2028 };
2029 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
2030 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2031 }
2032
2033 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2034}
2035
2036InstructionCost
2037ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2038 TTI::TargetCostKind CostKind) const {
2039 unsigned Opc = ICA.getID();
2040 switch (Opc) {
2041 case Intrinsic::get_active_lane_mask:
2042 // Currently we make a somewhat optimistic assumption that
2043 // active_lane_mask's are always free. In reality it may be freely folded
2044 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
2045 // of add/icmp code. We may need to improve this in the future, but being
2046 // able to detect if it is free or not involves looking at a lot of other
2047 // code. We currently assume that the vectorizer inserted these, and knew
2048 // what it was doing in adding one.
2049 if (ST->hasMVEIntegerOps())
2050 return 0;
2051 break;
2052 case Intrinsic::sadd_sat:
2053 case Intrinsic::ssub_sat:
2054 case Intrinsic::uadd_sat:
2055 case Intrinsic::usub_sat: {
2056 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2057 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2058 Type *RetTy = ICA.getReturnType();
2059
2060 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
2061 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2062 return 1; // qadd / qsub
2063 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2064 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2065 // Otherwise return the cost of expanding the node. Generally an add +
2066 // icmp + sel.
2067 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2068 Type *CondTy = RetTy->getWithNewBitWidth(1);
2069 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
2070 RetTy, CostKind) +
2071 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
2072 CostKind) +
2073 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
2074 CostKind);
2075 }
2076
2077 if (!ST->hasMVEIntegerOps())
2078 break;
2079
2080 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2081 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2082 LT.second == MVT::v16i8) {
2083 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
2084 // need to extend the type, as it uses shr(qadd(shl, shl)).
2085 unsigned Instrs =
2086 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2087 : 4;
2088 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2089 }
2090 break;
2091 }
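  // Illustrative example (value names invented): on a DSP-capable core,
  //   %r = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
  // costs 1 (a single QADD), while a <4 x i32> form under MVE costs one
  // beat-adjusted VQADD; types that first need widening pay the extra shift
  // instructions accounted for above.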
2092 case Intrinsic::abs:
2093 case Intrinsic::smin:
2094 case Intrinsic::smax:
2095 case Intrinsic::umin:
2096 case Intrinsic::umax: {
2097 if (!ST->hasMVEIntegerOps())
2098 break;
2099 Type *VT = ICA.getReturnType();
2100
2101 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2102 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2103 LT.second == MVT::v16i8)
2104 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2105 break;
2106 }
2107 case Intrinsic::minnum:
2108 case Intrinsic::maxnum: {
2109 if (!ST->hasMVEFloatOps())
2110 break;
2111 Type *VT = ICA.getReturnType();
2112 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2113 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2114 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2115 break;
2116 }
2117 case Intrinsic::fptosi_sat:
2118 case Intrinsic::fptoui_sat: {
2119 if (ICA.getArgTypes().empty())
2120 break;
2121 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2122 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
2123 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
2124 // Check for the legal types, with the correct subtarget features.
2125 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2126 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2127 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2128 return LT.first;
2129
2130 // Equally for MVE vector types
2131 if (ST->hasMVEFloatOps() &&
2132 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2133 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2134 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2135
2136 // If we can we use a legal convert followed by a min+max
2137 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2138 (ST->hasFP64() && LT.second == MVT::f64) ||
2139 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2140 (ST->hasMVEFloatOps() &&
2141 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2142 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2143 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2144 LT.second.getScalarSizeInBits());
2145 InstructionCost Cost =
2146 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2147 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2148 : Intrinsic::umin,
2149 LegalTy, {LegalTy, LegalTy});
2150 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
2151 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2152 : Intrinsic::umax,
2153 LegalTy, {LegalTy, LegalTy});
2154 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2155 return LT.first * Cost;
2156 }
2157 // Otherwise we need to follow the default expansion that clamps the value
2158 // using a float min/max with a fcmp+sel for nan handling when signed.
2159 Type *FPTy = ICA.getArgTypes()[0];
2160 Type *RetTy = ICA.getReturnType();
2161 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2162 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
2163 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2164 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2165 Cost +=
2166 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2167 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2168 if (IsSigned) {
2169 Type *CondTy = RetTy->getWithNewBitWidth(1);
2170 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2171 CmpInst::FCMP_UNO, CostKind);
2172 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2173 CmpInst::FCMP_UNO, CostKind);
2174 }
2175 return Cost;
2176 }
2177 }
2178
2179 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2180}
2181
2182bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2183 if (!F->isIntrinsic())
2184 return BaseT::isLoweredToCall(F);
2185
2186 // Assume all Arm-specific intrinsics map to an instruction.
2187 if (F->getName().starts_with("llvm.arm"))
2188 return false;
2189
2190 switch (F->getIntrinsicID()) {
2191 default: break;
2192 case Intrinsic::powi:
2193 case Intrinsic::sin:
2194 case Intrinsic::cos:
2195 case Intrinsic::sincos:
2196 case Intrinsic::pow:
2197 case Intrinsic::log:
2198 case Intrinsic::log10:
2199 case Intrinsic::log2:
2200 case Intrinsic::exp:
2201 case Intrinsic::exp2:
2202 return true;
2203 case Intrinsic::sqrt:
2204 case Intrinsic::fabs:
2205 case Intrinsic::copysign:
2206 case Intrinsic::floor:
2207 case Intrinsic::ceil:
2208 case Intrinsic::trunc:
2209 case Intrinsic::rint:
2210 case Intrinsic::nearbyint:
2211 case Intrinsic::round:
2212 case Intrinsic::canonicalize:
2213 case Intrinsic::lround:
2214 case Intrinsic::llround:
2215 case Intrinsic::lrint:
2216 case Intrinsic::llrint:
2217 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2218 return true;
2219 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2220 return true;
2221 // Some operations can be handled by vector instructions and assume
2222 // unsupported vectors will be expanded into supported scalar ones.
2223 // TODO Handle scalar operations properly.
2224 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2225 case Intrinsic::masked_store:
2226 case Intrinsic::masked_load:
2227 case Intrinsic::masked_gather:
2228 case Intrinsic::masked_scatter:
2229 return !ST->hasMVEIntegerOps();
2230 case Intrinsic::sadd_with_overflow:
2231 case Intrinsic::uadd_with_overflow:
2232 case Intrinsic::ssub_with_overflow:
2233 case Intrinsic::usub_with_overflow:
2234 case Intrinsic::sadd_sat:
2235 case Intrinsic::uadd_sat:
2236 case Intrinsic::ssub_sat:
2237 case Intrinsic::usub_sat:
2238 return false;
2239 }
2240
2241 return BaseT::isLoweredToCall(F);
2242}
2243
2244bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2245 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2246 EVT VT = TLI->getValueType(DL, I.getType(), true);
2247 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2248 return true;
2249
2250 // Check if an intrinsic will be lowered to a call and assume that any
2251 // other CallInst will generate a bl.
2252 if (auto *Call = dyn_cast<CallInst>(&I)) {
2253 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2254 switch(II->getIntrinsicID()) {
2255 case Intrinsic::memcpy:
2256 case Intrinsic::memset:
2257 case Intrinsic::memmove:
2258 return getNumMemOps(II) == -1;
2259 default:
2260 if (const Function *F = Call->getCalledFunction())
2261 return isLoweredToCall(F);
2262 }
2263 }
2264 return true;
2265 }
2266
2267 // FPv5 provides conversions between integer, double-precision,
2268 // single-precision, and half-precision formats.
2269 switch (I.getOpcode()) {
2270 default:
2271 break;
2272 case Instruction::FPToSI:
2273 case Instruction::FPToUI:
2274 case Instruction::SIToFP:
2275 case Instruction::UIToFP:
2276 case Instruction::FPTrunc:
2277 case Instruction::FPExt:
2278 return !ST->hasFPARMv8Base();
2279 }
2280
2281 // FIXME: Unfortunately the approach of checking the Operation Action does
2282 // not catch all cases of Legalization that use library calls. Our
2283 // Legalization step categorizes some transformations into library calls as
2284 // Custom, Expand or even Legal when doing type legalization. So for now
2285 // we have to special case for instance the SDIV of 64bit integers and the
2286 // use of floating point emulation.
2287 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2288 switch (ISD) {
2289 default:
2290 break;
2291 case ISD::SDIV:
2292 case ISD::UDIV:
2293 case ISD::SREM:
2294 case ISD::UREM:
2295 case ISD::SDIVREM:
2296 case ISD::UDIVREM:
2297 return true;
2298 }
2299 }
2300
2301 // Assume all other non-float operations are supported.
2302 if (!VT.isFloatingPoint())
2303 return false;
2304
2305 // We'll need a library call to handle most floats when using soft.
2306 if (TLI->useSoftFloat()) {
2307 switch (I.getOpcode()) {
2308 default:
2309 return true;
2310 case Instruction::Alloca:
2311 case Instruction::Load:
2312 case Instruction::Store:
2313 case Instruction::Select:
2314 case Instruction::PHI:
2315 return false;
2316 }
2317 }
2318
2319 // We'll need a libcall to perform double precision operations on a single
2320 // precision only FPU.
2321 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2322 return true;
2323
2324 // Likewise for half precision arithmetic.
2325 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2326 return true;
2327
2328 return false;
2329}
2330
2331bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2332 AssumptionCache &AC,
2333 TargetLibraryInfo *LibInfo,
2334 HardwareLoopInfo &HWLoopInfo) const {
2335 // Low-overhead branches are only supported in the 'low-overhead branch'
2336 // extension of v8.1-m.
2337 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2338 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2339 return false;
2340 }
2341
2342 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2343 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2344 return false;
2345 }
2346
2347 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2348 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2349 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2350 return false;
2351 }
2352
2353 const SCEV *TripCountSCEV =
2354 SE.getAddExpr(BackedgeTakenCount,
2355 SE.getOne(BackedgeTakenCount->getType()));
2356
2357 // We need to store the trip count in LR, a 32-bit register.
2358 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2359 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2360 return false;
2361 }
2362
2363 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2364 // point in generating a hardware loop if that's going to happen.
2365
2366 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2367 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2368 switch (Call->getIntrinsicID()) {
2369 default:
2370 break;
2371 case Intrinsic::start_loop_iterations:
2372 case Intrinsic::test_start_loop_iterations:
2373 case Intrinsic::loop_decrement:
2374 case Intrinsic::loop_decrement_reg:
2375 return true;
2376 }
2377 }
2378 return false;
2379 };
2380
2381 // Scan the instructions to see if there's any that we know will turn into a
2382 // call or if this loop is already a low-overhead loop or will become a tail
2383 // predicated loop.
2384 bool IsTailPredLoop = false;
2385 auto ScanLoop = [&](Loop *L) {
2386 for (auto *BB : L->getBlocks()) {
2387 for (auto &I : *BB) {
2388 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2389 isa<InlineAsm>(I)) {
2390 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2391 return false;
2392 }
2393 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2394 IsTailPredLoop |=
2395 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2396 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2397 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2398 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2399 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2400 }
2401 }
2402 return true;
2403 };
2404
2405 // Visit inner loops.
2406 for (auto *Inner : *L)
2407 if (!ScanLoop(Inner))
2408 return false;
2409
2410 if (!ScanLoop(L))
2411 return false;
2412
2413 // TODO: Check whether the trip count calculation is expensive. If L is the
2414 // inner loop but we know it has a low trip count, calculating that trip
2415 // count (in the parent loop) may be detrimental.
2416
2417 LLVMContext &C = L->getHeader()->getContext();
2418 HWLoopInfo.CounterInReg = true;
2419 HWLoopInfo.IsNestingLegal = false;
2420 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2421 HWLoopInfo.CountType = Type::getInt32Ty(C);
2422 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2423 return true;
2424}
2425
2426static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2427 // We don't allow icmp's, and because we only look at single block loops,
2428 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2429 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2430 return false;
2431 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2432 // not currently canonical, but soon will be. Code without them uses icmp, and
2433 // so is not tail predicated as per the condition above. In order to get the
2434 // same performance we treat min and max the same as an icmp for tailpred
2435 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2436 // pick more optimal instructions like VQDMULH. They need to be recognized
2437 // directly by the vectorizer).
2438 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2439 if ((II->getIntrinsicID() == Intrinsic::smin ||
2440 II->getIntrinsicID() == Intrinsic::smax ||
2441 II->getIntrinsicID() == Intrinsic::umin ||
2442 II->getIntrinsicID() == Intrinsic::umax) &&
2443 ++ICmpCount > 1)
2444 return false;
2445
2446 if (isa<FCmpInst>(&I))
2447 return false;
2448
2449 // We could allow extending/narrowing FP loads/stores, but codegen is
2450 // too inefficient so reject this for now.
2451 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2452 return false;
2453
2454 // Extends have to be extending-loads
2455 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2456 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2457 return false;
2458
2459 // Truncs have to be narrowing-stores
2460 if (isa<TruncInst>(&I) )
2461 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2462 return false;
2463
2464 return true;
2465}
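// Illustrative example (value names invented): "%w = zext i8 %x to i32" is
// only accepted here when %x is a single-use load (an extending load), and a
// trunc only when its sole user is a store (a narrowing store). Free-standing
// widening or narrowing would break the single uniform element size that
// tail-predication relies on, as described in the comment below.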
2466
2467// To set up a tail-predicated loop, we need to know the total number of
2468// elements processed by that loop. Thus, we need to determine the element
2469// size and:
2470// 1) it should be uniform for all operations in the vector loop, so we
2471// e.g. don't want any widening/narrowing operations.
2472// 2) it should be smaller than i64s because we don't have vector operations
2473// that work on i64s.
2474// 3) we don't want elements to be reversed or shuffled, to make sure the
2475// tail-predication masks/predicates the right lanes.
2476//
2477static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2478 const DataLayout &DL,
2479 const LoopAccessInfo *LAI,
2480 const DominatorTree &DT) {
2481 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2482
2483 // If there are live-out values, it is probably a reduction. We can predicate
2484 // most reduction operations freely under MVE using a combination of
2485 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2486 // floating point and integer reductions, but don't check for operators
2487 // specifically here. If the value ends up not being a reduction (and so the
2488 // vectorizer cannot tailfold the loop), we should fall back to standard
2489 // vectorization automatically.
2490 SmallVector<Instruction *, 8> LiveOuts;
2491 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2492 bool ReductionsDisabled =
2493 EnableTailPredication == TailPredication::EnabledNoReductions ||
2494 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2495
2496 for (auto *I : LiveOuts) {
2497 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2498 !I->getType()->isHalfTy()) {
2499 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2500 "live-out value\n");
2501 return false;
2502 }
2503 if (ReductionsDisabled) {
2504 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2505 return false;
2506 }
2507 }
2508
2509 // Next, check that all instructions can be tail-predicated.
2510 PredicatedScalarEvolution PSE = LAI->getPSE();
2511 int ICmpCount = 0;
2512
2513 for (BasicBlock *BB : L->blocks()) {
2514 for (Instruction &I : BB->instructionsWithoutDebug()) {
2515 if (isa<PHINode>(&I))
2516 continue;
2517 if (!canTailPredicateInstruction(I, ICmpCount)) {
2518 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2519 return false;
2520 }
2521
2522 Type *T = I.getType();
2523 if (T->getScalarSizeInBits() > 32) {
2524 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2525 return false;
2526 }
2527 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2528 Value *Ptr = getLoadStorePointerOperand(&I);
2529 Type *AccessTy = getLoadStoreType(&I);
2530 int64_t NextStride =
2531 getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0);
2532 if (NextStride == 1) {
2533 // TODO: for now only allow consecutive strides of 1. We could support
2534 // other strides as long as it is uniform, but let's keep it simple
2535 // for now.
2536 continue;
2537 } else if (NextStride == -1 ||
2538 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2539 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2541 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2542 "be tail-predicated\n.");
2543 return false;
2544 // TODO: don't tail predicate if there is a reversed load?
2545 } else if (EnableMaskedGatherScatters) {
2546 // Gather/scatters do allow loading from arbitrary strides, at
2547 // least if they are loop invariant.
2548 // TODO: Loop variant strides should in theory work, too, but
2549 // this requires further testing.
2550 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2551 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2552 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2553 if (PSE.getSE()->isLoopInvariant(Step, L))
2554 continue;
2555 }
2556 }
2557 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2558 "tail-predicate\n.");
2559 return false;
2560 }
2561 }
2562 }
2563
2564 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2565 return true;
2566}
2567
2568bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2569 if (!EnableTailPredication) {
2570 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2571 return false;
2572 }
2573
2574 // Creating a predicated vector loop is the first step for generating a
2575 // tail-predicated hardware loop, for which we need the MVE masked
2576 // load/stores instructions:
2577 if (!ST->hasMVEIntegerOps())
2578 return false;
2579
2580 LoopVectorizationLegality *LVL = TFI->LVL;
2581 Loop *L = LVL->getLoop();
2582
2583 // For now, restrict this to single block loops.
2584 if (L->getNumBlocks() > 1) {
2585 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2586 "loop.\n");
2587 return false;
2588 }
2589
2590 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2591
2592 LoopInfo *LI = LVL->getLoopInfo();
2593 HardwareLoopInfo HWLoopInfo(L);
2594 if (!HWLoopInfo.canAnalyze(*LI)) {
2595 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2596 "analyzable.\n");
2597 return false;
2598 }
2599
2602
2603 // This checks if we have the low-overhead branch architecture
2604 // extension, and if we will create a hardware-loop:
2605 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2606 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2607 "profitable.\n");
2608 return false;
2609 }
2610
2611 DominatorTree *DT = LVL->getDominatorTree();
2612 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2613 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2614 "a candidate.\n");
2615 return false;
2616 }
2617
2618 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
2619 *LVL->getDominatorTree());
2620}
2621
2622TailFoldingStyle
2623ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2624 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2625 return TailFoldingStyle::DataWithoutLaneMask;
2626
2627 // Intrinsic @llvm.get.active.lane.mask is supported.
2628 // It is used in the MVETailPredication pass, which requires the number of
2629 // elements processed by this vector loop to setup the tail-predicated
2630 // loop.
2631 return TailFoldingStyle::Data;
2632}
2633void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2634 TTI::UnrollingPreferences &UP,
2635 OptimizationRemarkEmitter *ORE) const {
2636 // Enable upper-bound unrolling universally, provided that we do not see an
2637 // active lane mask, which will be better kept as a loop to become tail
2638 // predicated than to be conditionally unrolled.
2639 UP.UpperBound =
2640 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2641 return isa<IntrinsicInst>(I) &&
2642 cast<IntrinsicInst>(I).getIntrinsicID() ==
2643 Intrinsic::get_active_lane_mask;
2644 });
2645
2646 // Only currently enable these preferences for M-Class cores.
2647 if (!ST->isMClass())
2648 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2649
2650 // Disable loop unrolling for Oz and Os.
2651 UP.OptSizeThreshold = 0;
2652 UP.PartialOptSizeThreshold = 0;
2653 if (L->getHeader()->getParent()->hasOptSize())
2654 return;
2655
2656 SmallVector<BasicBlock*, 4> ExitingBlocks;
2657 L->getExitingBlocks(ExitingBlocks);
2658 LLVM_DEBUG(dbgs() << "Loop has:\n"
2659 << "Blocks: " << L->getNumBlocks() << "\n"
2660 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2661
2662 // Only allow another exit other than the latch. This acts as an early exit
2663 // as it mirrors the profitability calculation of the runtime unroller.
2664 if (ExitingBlocks.size() > 2)
2665 return;
2666
2667 // Limit the CFG of the loop body for targets with a branch predictor.
2668 // Allowing 4 blocks permits if-then-else diamonds in the body.
2669 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2670 return;
2671
2672 // Don't unroll vectorized loops, including the remainder loop
2673 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2674 return;
2675
2676 // Scan the loop: don't unroll loops with calls as this could prevent
2677 // inlining.
2678 InstructionCost Cost = 0;
2679 for (auto *BB : L->getBlocks()) {
2680 for (auto &I : *BB) {
2681 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2682 // scalar code.
2683 if (I.getType()->isVectorTy())
2684 return;
2685
2686 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2687 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2688 if (!isLoweredToCall(F))
2689 continue;
2690 }
2691 return;
2692 }
2693
2694 SmallVector<const Value*, 4> Operands(I.operand_values());
2695 Cost += getInstructionCost(&I, Operands,
2696 TargetTransformInfo::TCK_SizeAndLatency);
2697 }
2698 }
2699
2700 // On v6m cores, there are very few registers available. We can easily end up
2701 // spilling and reloading more registers in an unrolled loop. Look at the
2702 // number of LCSSA phis as a rough measure of how many registers will need to
2703 // be live out of the loop, reducing the default unroll count if more than 1
2704 // value is needed. In the long run, all of this should be being learnt by a
2705 // machine.
2706 unsigned UnrollCount = 4;
2707 if (ST->isThumb1Only()) {
2708 unsigned ExitingValues = 0;
2709 SmallVector<BasicBlock *, 4> ExitBlocks;
2710 L->getExitBlocks(ExitBlocks);
2711 for (auto *Exit : ExitBlocks) {
2712 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2713 // only the last is expected to be needed for address operands.
2714 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2715 return PH.getNumOperands() != 1 ||
2716 !isa<GetElementPtrInst>(PH.getOperand(0));
2717 });
2718 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2719 }
2720 if (ExitingValues)
2721 UnrollCount /= ExitingValues;
2722 if (UnrollCount <= 1)
2723 return;
2724 }
2725
2726 // For processors with low overhead branching (LOB), runtime unrolling the
2727 // innermost loop is often detrimental to performance. In these cases the loop
2728 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2729 // deeply nested loops get executed multiple times, negating the benefits of
2730 // LOB. This is particularly noticeable when the loop trip count of the
2731 // innermost loop varies within the outer loop, such as in the case of
2732 // triangular matrix decompositions. In these cases we will prefer to not
2733 // unroll the innermost loop, with the intention for it to be executed as a
2734 // low overhead loop.
2735 bool Runtime = true;
2736 if (ST->hasLOB()) {
2737 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2738 const auto *BETC = SE.getBackedgeTakenCount(L);
2739 auto *Outer = L->getOutermostLoop();
2740 if ((L != Outer && Outer != L->getParentLoop()) ||
2741 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2742 Runtime = false;
2743 }
2744 }
2745 }
2746
2747 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2748 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2749
2750 UP.Partial = true;
2751 UP.Runtime = Runtime;
2752 UP.UnrollRemainder = true;
2753 UP.DefaultUnrollRuntimeCount = UnrollCount;
2754 UP.UnrollAndJam = true;
2755 UP.UnrollAndJamInnerLoopThreshold = 60;
2756
2757 // Force-unrolling small loops can be very useful because of the branch
2758 // taken cost of the backedge.
2759 if (Cost < ArmForceUnrollThreshold)
2760 UP.Force = true;
2761}
2762
2763void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2764 TTI::PeelingPreferences &PP) const {
2765 BaseT::getPeelingPreferences(L, SE, PP);
2766}
2767
2768bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2769 if (!ST->hasMVEIntegerOps())
2770 return false;
2771
2772 unsigned ScalarBits = Ty->getScalarSizeInBits();
2773 switch (Kind) {
2774 case RecurKind::Add:
2775 return ScalarBits <= 64;
2776 default:
2777 return false;
2778 }
2779}
2780
2781bool ARMTTIImpl::preferPredicatedReductionSelect() const {
2782 if (!ST->hasMVEIntegerOps())
2783 return false;
2784 return true;
2785}
2786
2787InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2788 StackOffset BaseOffset,
2789 bool HasBaseReg, int64_t Scale,
2790 unsigned AddrSpace) const {
2791 TargetLoweringBase::AddrMode AM;
2792 AM.BaseGV = BaseGV;
2793 AM.BaseOffs = BaseOffset.getFixed();
2794 AM.HasBaseReg = HasBaseReg;
2795 AM.Scale = Scale;
2796 AM.ScalableOffset = BaseOffset.getScalable();
2797 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2798 if (ST->hasFPAO())
2799 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2800 return 0;
2801 }
2802 return -1;
2803}
2804
2805bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2806 if (Thumb) {
2807 // B.W is available in any Thumb2-supporting target, and also in every
2808 // version of Armv8-M, even Baseline which does not include the rest of
2809 // Thumb2.
2810 return ST->isThumb2() || ST->hasV8MBaselineOps();
2811 } else {
2812 // B is available in all versions of the Arm ISA, so the only question is
2813 // whether that ISA is available at all.
2814 return ST->hasARMOps();
2815 }
2816}
2817
2818/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2819/// of the vector elements.
2820static bool areExtractExts(Value *Ext1, Value *Ext2) {
2821 using namespace PatternMatch;
2822
2823 auto areExtDoubled = [](Instruction *Ext) {
2824 return Ext->getType()->getScalarSizeInBits() ==
2825 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2826 };
2827
2828 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2829 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2830 !areExtDoubled(cast<Instruction>(Ext1)) ||
2831 !areExtDoubled(cast<Instruction>(Ext2)))
2832 return false;
2833
2834 return true;
2835}
2836
2837/// Check if sinking \p I's operands to I's basic block is profitable, because
2838/// the operands can be folded into a target instruction, e.g.
2839/// sext/zext can be folded into vsubl.
2840bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2841 SmallVectorImpl<Use *> &Ops) const {
2842 using namespace PatternMatch;
2843
2844 if (!I->getType()->isVectorTy())
2845 return false;
2846
2847 if (ST->hasNEON()) {
2848 switch (I->getOpcode()) {
2849 case Instruction::Sub:
2850 case Instruction::Add: {
2851 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2852 return false;
2853 Ops.push_back(&I->getOperandUse(0));
2854 Ops.push_back(&I->getOperandUse(1));
2855 return true;
2856 }
2857 default:
2858 return false;
2859 }
2860 }
2861
2862 if (!ST->hasMVEIntegerOps())
2863 return false;
2864
2865 auto IsFMSMul = [&](Instruction *I) {
2866 if (!I->hasOneUse())
2867 return false;
2868 auto *Sub = cast<Instruction>(*I->users().begin());
2869 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2870 };
2871 auto IsFMS = [&](Instruction *I) {
2872 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2873 match(I->getOperand(1), m_FNeg(m_Value())))
2874 return true;
2875 return false;
2876 };
2877
2878 auto IsSinker = [&](Instruction *I, int Operand) {
2879 switch (I->getOpcode()) {
2880 case Instruction::Add:
2881 case Instruction::Mul:
2882 case Instruction::FAdd:
2883 case Instruction::ICmp:
2884 case Instruction::FCmp:
2885 return true;
2886 case Instruction::FMul:
2887 return !IsFMSMul(I);
2888 case Instruction::Sub:
2889 case Instruction::FSub:
2890 case Instruction::Shl:
2891 case Instruction::LShr:
2892 case Instruction::AShr:
2893 return Operand == 1;
2894 case Instruction::Call:
2895 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2896 switch (II->getIntrinsicID()) {
2897 case Intrinsic::fma:
2898 return !IsFMS(I);
2899 case Intrinsic::sadd_sat:
2900 case Intrinsic::uadd_sat:
2901 case Intrinsic::arm_mve_add_predicated:
2902 case Intrinsic::arm_mve_mul_predicated:
2903 case Intrinsic::arm_mve_qadd_predicated:
2904 case Intrinsic::arm_mve_vhadd:
2905 case Intrinsic::arm_mve_hadd_predicated:
2906 case Intrinsic::arm_mve_vqdmull:
2907 case Intrinsic::arm_mve_vqdmull_predicated:
2908 case Intrinsic::arm_mve_vqdmulh:
2909 case Intrinsic::arm_mve_qdmulh_predicated:
2910 case Intrinsic::arm_mve_vqrdmulh:
2911 case Intrinsic::arm_mve_qrdmulh_predicated:
2912 case Intrinsic::arm_mve_fma_predicated:
2913 return true;
2914 case Intrinsic::ssub_sat:
2915 case Intrinsic::usub_sat:
2916 case Intrinsic::arm_mve_sub_predicated:
2917 case Intrinsic::arm_mve_qsub_predicated:
2918 case Intrinsic::arm_mve_hsub_predicated:
2919 case Intrinsic::arm_mve_vhsub:
2920 return Operand == 1;
2921 default:
2922 return false;
2923 }
2924 }
2925 return false;
2926 default:
2927 return false;
2928 }
2929 };
2930
2931 for (auto OpIdx : enumerate(I->operands())) {
2932 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2933 // Make sure we are not already sinking this operand
2934 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2935 continue;
2936
2937 Instruction *Shuffle = Op;
2938 if (Shuffle->getOpcode() == Instruction::BitCast)
2939 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
2940 // We are looking for a splat that can be sunk.
2941 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
2942 m_ZeroInt()),
2943 m_Undef(), m_ZeroMask())))
2944 continue;
2945 if (!IsSinker(I, OpIdx.index()))
2946 continue;
2947
2948 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2949 // and vector registers
2950 for (Use &U : Op->uses()) {
2951 Instruction *Insn = cast<Instruction>(U.getUser());
2952 if (!IsSinker(Insn, U.getOperandNo()))
2953 return false;
2954 }
2955
2956 Ops.push_back(&Shuffle->getOperandUse(0));
2957 if (Shuffle != Op)
2958 Ops.push_back(&Op->getOperandUse(0));
2959 Ops.push_back(&OpIdx.value());
2960 }
2961 return true;
2962}
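// Illustrative example (value names invented) of the splat shape sunk above:
//   %ins   = insertelement <4 x i32> undef, i32 %s, i64 0
//   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
//                          <4 x i32> zeroinitializer
//   ...
//   %r     = mul <4 x i32> %v, %splat   ; user in another block
// Sinking the splat next to its users lets MVE select the scalar-operand form
// (e.g. vmul.i32 q0, q1, r2) instead of keeping the broadcast live in a
// vector register across blocks.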
2963
2964unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
2965 Type *ArrayType) const {
2966 if (!UseWidenGlobalArrays) {
2967 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
2968 return false;
2969 }
2970
2971 // Don't modify non-integer array types.
2972 if (!ArrayType || !ArrayType->isArrayTy() ||
2973 !ArrayType->getArrayElementType()->isIntegerTy())
2974 return 0;
2975
2976 // We pad to 4 byte boundaries
2977 if (Size % 4 == 0)
2978 return 0;
2979
2980 unsigned NumBytesToPad = 4 - (Size % 4);
2981 unsigned NewSize = Size + NumBytesToPad;
2982
2983 // Max number of bytes that memcpy allows for lowering to load/stores before
2984 // it uses a library function (__aeabi_memcpy).
2985 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2986
2987 if (NewSize > MaxMemIntrinsicSize)
2988 return 0;
2989
2990 return NumBytesToPad;
2991}
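// Illustrative example: a 6-byte constant string would be padded by 2 bytes
// (6 % 4 == 2, so NumBytesToPad == 4 - 2 == 2) to give an 8-byte array,
// provided 8 is within the memcpy inline-expansion threshold; a 12-byte array
// is already a multiple of 4 bytes and gets no padding.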
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
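The overrides above are normally reached through the public TargetTransformInfo wrapper. A hedged sketch of how a client pass might query them; the opcode, types and cost kind are arbitrary examples, and costSketch is a hypothetical helper, not code from this file:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

InstructionCost costSketch(const TargetTransformInfo &TTI, Type *VecTy,
                           Type *WideTy, const Instruction *I) {
  // Reciprocal-throughput cost of a vector add of type VecTy.
  InstructionCost AddCost = TTI.getArithmeticInstrCost(
      Instruction::Add, VecTy, TargetTransformInfo::TCK_RecipThroughput);
  // Cost of sign-extending VecTy to WideTy outside any load/store context.
  InstructionCost ExtCost = TTI.getCastInstrCost(
      Instruction::SExt, WideTy, VecTy,
      TargetTransformInfo::CastContextHint::None,
      TargetTransformInfo::TCK_RecipThroughput, I);
  return AddCost + ExtCost;
}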
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Class to represent array types.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:502
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
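A minimal sketch of these IRBuilder helpers in use, assuming the caller supplies a basic block BB and a floating-point value X; splatSketch is a hypothetical helper, not part of this file:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

Value *splatSketch(BasicBlock *BB, Value *X) {
  IRBuilder<> B(BB->getContext());
  // Append new instructions to the end of BB.
  B.SetInsertPoint(BB);
  // Build a <4 x i32> splat of the constant 7.
  Value *Splat = B.CreateVectorSplat(4, B.getInt32(7));
  // Overloaded intrinsic call, mangled on the type of X.
  Value *Abs = B.CreateIntrinsic(Intrinsic::fabs, {X->getType()}, {X});
  (void)Splat;
  return Abs;
}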
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
AssumptionCache & getAssumptionCache() const
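A hedged sketch of the shape of a target instCombineIntrinsic-style hook built on these InstCombiner helpers; the particular checks and rewrites are purely illustrative and do not correspond to a real intrinsic simplification:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
using namespace llvm;

static std::optional<Instruction *> combineSketch(InstCombiner &IC,
                                                  IntrinsicInst &II) {
  // Canonicalize a non-constant second operand to zero (pretending the
  // intrinsic ignores it); replaceOperand also re-queues the old operand.
  if (!isa<Constant>(II.getArgOperand(1)))
    return IC.replaceOperand(
        II, 1, Constant::getNullValue(II.getArgOperand(1)->getType()));
  // Pretend the call is known to return its first argument unchanged:
  // forward it to all users via the combiner-aware RAUW.
  if (II.getType() == II.getArgOperand(0)->getType())
    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  return std::nullopt;
}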
static InstructionCost getInvalid(CostType Val=0)
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isShift() const
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
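A hedged sketch of how a loop pass might combine these ScalarEvolution queries to test for a small, known trip count; smallTripCountSketch and its limit are hypothetical:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;

// True if L has a loop-invariant, constant trip count strictly below Limit.
static bool smallTripCountSketch(ScalarEvolution &SE, const Loop *L,
                                 unsigned Limit) {
  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return false;
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  // Trip count is the backedge-taken count plus one.
  const SCEV *TC = SE.getAddExpr(BTC, SE.getOne(BTC->getType()));
  if (const auto *C = dyn_cast<SCEVConstant>(TC))
    return C->getAPInt().ult(Limit);
  return false;
}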
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
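For example, a sketch with hand-written masks (the helper below is hypothetical, and the masks are just illustrative values):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void shuffleMaskSketch() {
  // <0, 2, 4, 6> selects the even lanes of an 8-element input: a
  // de-interleave of factor 2 starting at index 0.
  int DMask[] = {0, 2, 4, 6};
  unsigned Index = 0;
  bool IsDeinterleave =
      ShuffleVectorInst::isDeInterleaveMaskOfFactor(DMask, 2, Index);

  // <0, 4, 1, 5, 2, 6, 3, 7> interleaves two 4-element inputs (8 input
  // elements in total) with a factor of 2.
  int IMask[] = {0, 4, 1, 5, 2, 6, 3, 7};
  SmallVector<unsigned, 4> StartIndexes;
  bool IsInterleave =
      ShuffleVectorInst::isInterleaveMask(IMask, 2, 8, StartIndexes);
  (void)IsDeinterleave; (void)IsInterleave;
}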
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Provides information about what library functions are available for the current target.
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual bool isLoweredToCall(const Function *F) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition Type.h:264
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
Type * getArrayElementType() const
Definition Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
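A small sketch of these Type queries used together; typeQuerySketch is a hypothetical helper and the types are arbitrary examples:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;

void typeQuerySketch(LLVMContext &Ctx) {
  // <8 x i16>: 16-bit lanes, 128 bits overall.
  Type *V8i16 = FixedVectorType::get(Type::getIntNTy(Ctx, 16), 8);
  unsigned LaneBits = V8i16->getScalarSizeInBits(); // 16
  Type *Lane = V8i16->getScalarType();              // i16
  // Same lane count, widened to 32-bit lanes: <8 x i32>.
  Type *V8i32 = V8i16->getWithNewBitWidth(32);
  (void)LaneBits; (void)Lane; (void)V8i32;
}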
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
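A hedged sketch of how these encoders are typically consulted, assuming the usual ARM_AM namespace from ARMAddressingModes.h; the cheapness heuristic below is illustrative rather than the exact logic used in this file:

#include "MCTargetDesc/ARMAddressingModes.h"
using namespace llvm;

// An immediate is "cheap" here if it (or its bitwise complement) has a
// shifter_operand encoding for the requested instruction set.
static bool isCheapARMImmediateSketch(unsigned Imm, bool IsThumb2) {
  if (IsThumb2)
    return ARM_AM::getT2SOImmVal(Imm) != -1 ||
           ARM_AM::getT2SOImmVal(~Imm) != -1;
  return ARM_AM::getSOImmVal(Imm) != -1 || ARM_AM::getSOImmVal(~Imm) != -1;
}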
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
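Two small sketches of these matchers in use; the helper names are hypothetical:

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;

// True if V is an add whose operands are each a zext or sext, in either
// operand order; binds the extended values to X and Y.
bool matchWideningAddSketch(Value *V, Value *&X, Value *&Y) {
  return match(V, m_c_Add(m_ZExtOrSExt(m_Value(X)), m_ZExtOrSExt(m_Value(Y))));
}

// True if V is a call to llvm.fabs; binds the operand to X.
bool matchFabsSketch(Value *V, Value *&X) {
  return match(V, m_Intrinsic<Intrinsic::fabs>(m_Value(X)));
}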
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2484
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition Local.h:252
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
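A minimal sketch (hypothetical helper) of recognising a signed-max select idiom with matchSelectPattern:

#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;

// True if V computes a signed-maximum idiom such as
// "select (icmp sgt a, b), a, b"; binds the two compared values to A and B.
bool isSMaxIdiomSketch(Value *V, Value *&A, Value *&B) {
  SelectPatternResult SPR = matchSelectPattern(V, A, B);
  return SPR.Flavor == SPF_SMAX;
}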
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
RecurKind
These are the kinds of recurrences that we support.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1973
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
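A hedged sketch of a cost-table lookup; the table entries and fall-back cost are made-up examples, not values used by this target:

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

// Toy table mapping (ISD opcode, legalized type) to a throughput cost.
static const CostTblEntry SketchCostTbl[] = {
    {ISD::ADD, MVT::v4i32, 1},
    {ISD::SHL, MVT::v4i32, 2},
};

unsigned lookupCostSketch(int ISDOpcode, MVT Ty) {
  if (const auto *Entry = CostTableLookup(SketchCostTbl, ISDOpcode, Ty))
    return Entry->Cost;
  return 4; // arbitrary fall-back cost for types not in the table
}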
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
Attributes of a target dependent hardware loop.
LLVM_ABI bool canAnalyze(LoopInfo &LI)
LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
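A hedged sketch of how a target's getUnrollingPreferences override might populate these fields; the particular settings and thresholds are arbitrary examples, not the ones this file uses:

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

void tuneUnrollingSketch(TargetTransformInfo::UnrollingPreferences &UP,
                         bool OptForSize) {
  UP.Partial = true;             // allow partial unrolling
  UP.Runtime = true;             // allow runtime unrolling
  UP.UpperBound = true;          // may unroll using the trip-count upper bound
  UP.UnrollRemainder = true;     // also unroll the runtime remainder loop
  UP.DefaultUnrollRuntimeCount = 4;
  if (OptForSize)
    UP.OptSizeThreshold = 4;     // keep the unrolled body very small for -Os
}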