ARMTargetTransformInfo.cpp
1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
63extern cl::opt<TailPredication::Mode> EnableTailPredication;
64
65extern cl::opt<bool> EnableMaskedGatherScatters;
66
67extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
68
69/// Convert a vector load intrinsic into a simple llvm load instruction.
70/// This is beneficial when the underlying object being addressed comes
71/// from a constant, since we get constant-folding for free.
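/// For example, a NEON vld1 of a constant global can become a plain aligned
/// load, which the optimizer can then constant-fold.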
72static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
73 InstCombiner::BuilderTy &Builder) {
74 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
75
76 if (!IntrAlign)
77 return nullptr;
78
79 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
80 ? MemAlign
81 : IntrAlign->getLimitedValue();
82
83 if (!isPowerOf2_32(Alignment))
84 return nullptr;
85
86 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
87 Align(Alignment));
88}
89
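// A callee can be inlined when its subtarget features are compatible with the
// caller's; see the exact-match and subset checks below.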
90bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
91 const Function *Callee) const {
92 const TargetMachine &TM = getTLI()->getTargetMachine();
93 const FeatureBitset &CallerBits =
94 TM.getSubtargetImpl(*Caller)->getFeatureBits();
95 const FeatureBitset &CalleeBits =
96 TM.getSubtargetImpl(*Callee)->getFeatureBits();
97
98 // To inline a callee, all features not in the allowed list must match exactly.
99 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
100 (CalleeBits & ~InlineFeaturesAllowed);
101 // For features in the allowed list, the callee's features must be a subset of
102 // the callers'.
103 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
104 (CalleeBits & InlineFeaturesAllowed);
105 return MatchExact && MatchSubset;
106}
107
108TTI::AddressingModeKind
109ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
110 ScalarEvolution *SE) const {
111 if (ST->hasMVEIntegerOps())
112 return TTI::AMK_PostIndexed;
113
114 if (L->getHeader()->getParent()->hasOptSize())
115 return TTI::AMK_None;
116
117 if (ST->isMClass() && ST->isThumb2() &&
118 L->getNumBlocks() == 1)
119 return TTI::AMK_PreIndexed;
120
121 return TTI::AMK_None;
122}
123
124std::optional<Instruction *>
125ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
126 using namespace PatternMatch;
127 Intrinsic::ID IID = II.getIntrinsicID();
128 switch (IID) {
129 default:
130 break;
131 case Intrinsic::arm_neon_vld1: {
132 Align MemAlign =
133 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
134 &IC.getAssumptionCache(), &IC.getDominatorTree());
135 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
136 return IC.replaceInstUsesWith(II, V);
137 }
138 break;
139 }
140
141 case Intrinsic::arm_neon_vld2:
142 case Intrinsic::arm_neon_vld3:
143 case Intrinsic::arm_neon_vld4:
144 case Intrinsic::arm_neon_vld2lane:
145 case Intrinsic::arm_neon_vld3lane:
146 case Intrinsic::arm_neon_vld4lane:
147 case Intrinsic::arm_neon_vst1:
148 case Intrinsic::arm_neon_vst2:
149 case Intrinsic::arm_neon_vst3:
150 case Intrinsic::arm_neon_vst4:
151 case Intrinsic::arm_neon_vst2lane:
152 case Intrinsic::arm_neon_vst3lane:
153 case Intrinsic::arm_neon_vst4lane: {
154 Align MemAlign =
155 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
156 &IC.getAssumptionCache(), &IC.getDominatorTree());
157 unsigned AlignArg = II.arg_size() - 1;
158 Value *AlignArgOp = II.getArgOperand(AlignArg);
159 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
160 if (Align && *Align < MemAlign) {
161 return IC.replaceOperand(
162 II, AlignArg,
163 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
164 false));
165 }
166 break;
167 }
168
169 case Intrinsic::arm_neon_vld1x2:
170 case Intrinsic::arm_neon_vld1x3:
171 case Intrinsic::arm_neon_vld1x4:
172 case Intrinsic::arm_neon_vst1x2:
173 case Intrinsic::arm_neon_vst1x3:
174 case Intrinsic::arm_neon_vst1x4: {
175 Align NewAlign =
176 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
177 &IC.getAssumptionCache(), &IC.getDominatorTree());
178 Align OldAlign = II.getParamAlign(0).valueOrOne();
179 if (NewAlign > OldAlign)
180 II.addParamAttr(0,
181 Attribute::getWithAlignment(II.getContext(), NewAlign));
182 break;
183 }
184
185 case Intrinsic::arm_mve_pred_i2v: {
186 Value *Arg = II.getArgOperand(0);
187 Value *ArgArg;
188 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
189 PatternMatch::m_Value(ArgArg))) &&
190 II.getType() == ArgArg->getType()) {
191 return IC.replaceInstUsesWith(II, ArgArg);
192 }
193 Constant *XorMask;
194 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
195 PatternMatch::m_Value(ArgArg)),
196 PatternMatch::m_Constant(XorMask))) &&
197 II.getType() == ArgArg->getType()) {
198 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
199 if (CI->getValue().trunc(16).isAllOnes()) {
200 auto TrueVector = IC.Builder.CreateVectorSplat(
201 cast<FixedVectorType>(II.getType())->getNumElements(),
202 IC.Builder.getTrue());
203 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
204 }
205 }
206 }
207 KnownBits ScalarKnown(32);
208 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
209 ScalarKnown)) {
210 return &II;
211 }
212 break;
213 }
214 case Intrinsic::arm_mve_pred_v2i: {
215 Value *Arg = II.getArgOperand(0);
216 Value *ArgArg;
217 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
218 PatternMatch::m_Value(ArgArg)))) {
219 return IC.replaceInstUsesWith(II, ArgArg);
220 }
221
222 if (II.getMetadata(LLVMContext::MD_range))
223 break;
224
225 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
226
227 if (auto CurrentRange = II.getRange()) {
228 Range = Range.intersectWith(*CurrentRange);
229 if (Range == CurrentRange)
230 break;
231 }
232
233 II.addRangeRetAttr(Range);
234 II.addRetAttr(Attribute::NoUndef);
235 return &II;
236 }
237 case Intrinsic::arm_mve_vadc:
238 case Intrinsic::arm_mve_vadc_predicated: {
239 unsigned CarryOp =
240 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
241 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
242 "Bad type for intrinsic!");
243
244 KnownBits CarryKnown(32);
245 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
246 CarryKnown)) {
247 return &II;
248 }
249 break;
250 }
251 case Intrinsic::arm_mve_vmldava: {
252 Instruction *I = cast<Instruction>(&II);
253 if (I->hasOneUse()) {
254 auto *User = cast<Instruction>(*I->user_begin());
255 Value *OpZ;
256 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
257 match(I->getOperand(3), m_Zero())) {
258 Value *OpX = I->getOperand(4);
259 Value *OpY = I->getOperand(5);
260 Type *OpTy = OpX->getType();
261
262 IC.Builder.SetInsertPoint(User);
263 Value *V =
264 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
265 {I->getOperand(0), I->getOperand(1),
266 I->getOperand(2), OpZ, OpX, OpY});
267
268 IC.replaceInstUsesWith(*User, V);
269 return IC.eraseInstFromFunction(*User);
270 }
271 }
272 return std::nullopt;
273 }
274 }
275 return std::nullopt;
276}
277
278std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
279 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
280 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
281 std::function<void(Instruction *, unsigned, APInt, APInt &)>
282 SimplifyAndSetOp) const {
283
284 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
285 // opcode specifying a Top/Bottom instruction, which can change between
286 // instructions.
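 // For example, a "top" narrowing operation writes only the odd result lanes,
 // so only the even lanes of the passthrough operand 0 stay live.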
287 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
288 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
289 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
290
291 // Only the odd/even lanes of operand 0 will be demanded, depending on
292 // whether this is a top/bottom instruction.
293 APInt DemandedElts =
294 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
295 : APInt::getHighBitsSet(2, 1));
296 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
297 // The other lanes will be defined from the inserted elements.
298 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
299 : APInt::getHighBitsSet(2, 1));
300 return std::nullopt;
301 };
302
303 switch (II.getIntrinsicID()) {
304 default:
305 break;
306 case Intrinsic::arm_mve_vcvt_narrow:
307 SimplifyNarrowInstrTopBottom(2);
308 break;
309 case Intrinsic::arm_mve_vqmovn:
310 SimplifyNarrowInstrTopBottom(4);
311 break;
312 case Intrinsic::arm_mve_vshrn:
313 SimplifyNarrowInstrTopBottom(7);
314 break;
315 }
316
317 return std::nullopt;
318}
319
320InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
321 TTI::TargetCostKind CostKind) {
322 assert(Ty->isIntegerTy());
323
324 unsigned Bits = Ty->getPrimitiveSizeInBits();
325 if (Bits == 0 || Imm.getActiveBits() >= 64)
326 return 4;
327
328 int64_t SImmVal = Imm.getSExtValue();
329 uint64_t ZImmVal = Imm.getZExtValue();
330 if (!ST->isThumb()) {
331 if ((SImmVal >= 0 && SImmVal < 65536) ||
332 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
333 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
334 return 1;
335 return ST->hasV6T2Ops() ? 2 : 3;
336 }
337 if (ST->isThumb2()) {
338 if ((SImmVal >= 0 && SImmVal < 65536) ||
339 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
340 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
341 return 1;
342 return ST->hasV6T2Ops() ? 2 : 3;
343 }
344 // Thumb1: any i8 immediate costs 1.
345 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
346 return 1;
347 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
348 return 2;
349 // Load from constantpool.
350 return 3;
351}
352
353// Constants smaller than 256 fit in the immediate field of
354// Thumb1 instructions, so we return a cost of zero; otherwise we return 1.
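// For example, in Thumb1 "adds r0, #42" encodes the constant in the instruction
// itself, so no extra code is needed to materialise it.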
355InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
356 const APInt &Imm, Type *Ty) {
357 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
358 return 0;
359
360 return 1;
361}
362
363// Checks whether Inst is part of a min(max()) or max(min()) pattern
364// that will match to an SSAT instruction. Returns the instruction being
365// saturated, or null if no saturation pattern was found.
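// For example, smax(smin(x, 127), -128) clamps x to the signed 8-bit range and
// can be selected as "ssat r0, #8, r1".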
366static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
367 Value *LHS, *RHS;
368 ConstantInt *C;
369 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
370
371 if (InstSPF == SPF_SMAX &&
372 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
373 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
374
375 auto isSSatMin = [&](Value *MinInst) {
376 if (isa<SelectInst>(MinInst)) {
377 Value *MinLHS, *MinRHS;
378 ConstantInt *MinC;
379 SelectPatternFlavor MinSPF =
380 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
381 if (MinSPF == SPF_SMIN &&
382 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
383 MinC->getValue() == ((-Imm) - 1))
384 return true;
385 }
386 return false;
387 };
388
389 if (isSSatMin(Inst->getOperand(1)))
390 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
391 if (Inst->hasNUses(2) &&
392 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
393 return Inst->getOperand(1);
394 }
395 return nullptr;
396}
397
398// Look for an FP saturation pattern, where the instruction can be simplified
399// to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
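// The ARM vcvt instructions already saturate on overflow, so the surrounding
// max/min clamp adds no extra instructions.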
400static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
401 if (Imm.getBitWidth() != 64 ||
402 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
403 return false;
404 Value *FP = isSSATMinMaxPattern(Inst, Imm);
405 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
406 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
407 if (!FP)
408 return false;
409 return isa<FPToSIInst>(FP);
410}
411
412InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
413 const APInt &Imm, Type *Ty,
414 TTI::TargetCostKind CostKind,
415 Instruction *Inst) {
416 // Division by a constant can be turned into multiplication, but only if we
417 // know it's constant. So it's not so much that the immediate is cheap (it's
418 // not), but that the alternative is worse.
419 // FIXME: this is probably unneeded with GlobalISel.
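 // For example, "x / 10" can be lowered to a multiply-high plus shifts, but
 // only while the 10 is still visible as an immediate operand.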
420 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
421 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
422 Idx == 1)
423 return 0;
424
425 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
426 // splitting any large offsets.
427 if (Opcode == Instruction::GetElementPtr && Idx != 0)
428 return 0;
429
430 if (Opcode == Instruction::And) {
431 // UXTB/UXTH
432 if (Imm == 255 || Imm == 65535)
433 return 0;
434 // Conversion to BIC is free, and means we can use ~Imm instead.
435 return std::min(getIntImmCost(Imm, Ty, CostKind),
436 getIntImmCost(~Imm, Ty, CostKind));
437 }
438
439 if (Opcode == Instruction::Add)
440 // Conversion to SUB is free, and means we can use -Imm instead.
441 return std::min(getIntImmCost(Imm, Ty, CostKind),
442 getIntImmCost(-Imm, Ty, CostKind));
443
444 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
445 Ty->getIntegerBitWidth() == 32) {
446 int64_t NegImm = -Imm.getSExtValue();
447 if (ST->isThumb2() && NegImm < 1<<12)
448 // icmp X, #-C -> cmn X, #C
449 return 0;
450 if (ST->isThumb() && NegImm < 1<<8)
451 // icmp X, #-C -> adds X, #C
452 return 0;
453 }
454
455 // xor a, -1 can always be folded to MVN
456 if (Opcode == Instruction::Xor && Imm.isAllOnes())
457 return 0;
458
459 // Ensure that negative constants of min(max()) or max(min()) patterns that
460 // match SSAT instructions don't get hoisted.
461 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
462 Ty->getIntegerBitWidth() <= 32) {
463 if (isSSATMinMaxPattern(Inst, Imm) ||
464 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
465 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
466 return 0;
467 }
468
469 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
470 return 0;
471
472 // We can convert <= -1 to < 0, which is generally quite cheap.
473 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
474 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
475 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
476 return std::min(getIntImmCost(Imm, Ty, CostKind),
477 getIntImmCost(Imm + 1, Ty, CostKind));
478 }
479
480 return getIntImmCost(Imm, Ty, CostKind);
481}
482
483InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
484 TTI::TargetCostKind CostKind,
485 const Instruction *I) {
486 if (CostKind == TTI::TCK_RecipThroughput &&
487 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
488 // FIXME: The vectorizer is highly sensitive to the cost of these
489 // instructions, which suggests that it may be using the costs incorrectly.
490 // But, for now, just make them free to avoid performance regressions for
491 // vector targets.
492 return 0;
493 }
494 return BaseT::getCFInstrCost(Opcode, CostKind, I);
495}
496
497InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
498 Type *Src,
499 TTI::CastContextHint CCH,
500 TTI::TargetCostKind CostKind,
501 const Instruction *I) {
502 int ISD = TLI->InstructionOpcodeToISD(Opcode);
503 assert(ISD && "Invalid opcode");
504
505 // TODO: Allow non-throughput costs that aren't binary.
506 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
507 if (CostKind != TTI::TCK_RecipThroughput)
508 return Cost == 0 ? 0 : 1;
509 return Cost;
510 };
511 auto IsLegalFPType = [this](EVT VT) {
512 EVT EltVT = VT.getScalarType();
513 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
514 (EltVT == MVT::f64 && ST->hasFP64()) ||
515 (EltVT == MVT::f16 && ST->hasFullFP16());
516 };
517
518 EVT SrcTy = TLI->getValueType(DL, Src);
519 EVT DstTy = TLI->getValueType(DL, Dst);
520
521 if (!SrcTy.isSimple() || !DstTy.isSimple())
522 return AdjustCost(
523 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
524
525 // Extending masked loads/truncating masked stores is expensive because we
526 // currently don't split them. This means that we'll likely end up
527 // loading/storing each element individually (hence the high cost).
528 if ((ST->hasMVEIntegerOps() &&
529 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
530 Opcode == Instruction::SExt)) ||
531 (ST->hasMVEFloatOps() &&
532 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
533 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
534 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
535 return 2 * DstTy.getVectorNumElements() *
536 ST->getMVEVectorCostFactor(CostKind);
537
538 // The extend of other kinds of load is free
539 if (CCH == TTI::CastContextHint::Normal ||
540 CCH == TTI::CastContextHint::Masked) {
541 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
542 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
543 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
544 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
545 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
546 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
547 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
548 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
549 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
550 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
551 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
552 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
553 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
554 };
555 if (const auto *Entry = ConvertCostTableLookup(
556 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
557 return AdjustCost(Entry->Cost);
558
559 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
560 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
561 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
562 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
563 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
564 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
565 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
566 // The following extend from a legal type to an illegal type, so we need to
567 // split the load. This introduces an extra load operation, but the
568 // extend is still "free".
569 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
570 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
571 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
572 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
573 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
574 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
575 };
576 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
577 if (const auto *Entry =
578 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
579 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
580 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
581 }
582
583 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
584 // FPExtends are similar but also require the VCVT instructions.
585 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
586 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
587 };
588 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
589 if (const auto *Entry =
590 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
591 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
592 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
593 }
594
595 // The truncate of a store is free. This is the mirror of extends above.
596 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
597 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
598 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
599 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
600 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
601 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
602 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
603 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
604 };
605 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
606 if (const auto *Entry =
607 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
608 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
609 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
610 }
611
612 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
613 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
614 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
615 };
616 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
617 if (const auto *Entry =
618 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
619 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
620 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
621 }
622 }
623
624 // NEON vector operations that can extend their inputs.
625 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
626 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
627 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
628 // vaddl
629 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
630 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
631 // vsubl
632 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
633 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
634 // vmull
635 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
636 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
637 // vshll
638 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
639 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
640 };
641
642 auto *User = cast<Instruction>(*I->user_begin());
643 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
644 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
645 DstTy.getSimpleVT(),
646 SrcTy.getSimpleVT())) {
647 return AdjustCost(Entry->Cost);
648 }
649 }
650
651 // Single to/from double precision conversions.
652 if (Src->isVectorTy() && ST->hasNEON() &&
653 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
654 DstTy.getScalarType() == MVT::f32) ||
655 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
656 DstTy.getScalarType() == MVT::f64))) {
657 static const CostTblEntry NEONFltDblTbl[] = {
658 // Vector fptrunc/fpext conversions.
659 {ISD::FP_ROUND, MVT::v2f64, 2},
660 {ISD::FP_EXTEND, MVT::v2f32, 2},
661 {ISD::FP_EXTEND, MVT::v4f32, 4}};
662
663 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
664 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
665 return AdjustCost(LT.first * Entry->Cost);
666 }
667
668 // Some arithmetic, load and store operations have specific instructions
669 // to cast up/down their types automatically at no extra cost.
670 // TODO: Get these tables to know at least what the related operations are.
671 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
672 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
673 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
674 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
675 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
676 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
677 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
678
679 // The number of vmovl instructions for the extension.
680 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
681 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
682 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
683 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
684 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
685 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
686 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
687 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
688 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
689 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
690 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
691 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
692 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
693 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
694 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
695 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
696 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
697 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
698
699 // Operations that we legalize using splitting.
700 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
701 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
702
703 // Vector float <-> i32 conversions.
704 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
705 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
706
707 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
708 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
709 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
710 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
711 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
712 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
713 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
714 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
715 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
716 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
717 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
718 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
719 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
720 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
721 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
722 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
723 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
724 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
725 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
726 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
727
728 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
729 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
730 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
731 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
732 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
733 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
734
735 // Vector double <-> i32 conversions.
736 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
737 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
738
739 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
740 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
741 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
742 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
743 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
744 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
745
746 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
747 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
748 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
749 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
750 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
751 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
752 };
753
754 if (SrcTy.isVector() && ST->hasNEON()) {
755 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
756 DstTy.getSimpleVT(),
757 SrcTy.getSimpleVT()))
758 return AdjustCost(Entry->Cost);
759 }
760
761 // Scalar float to integer conversions.
762 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
763 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
764 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
765 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
766 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
767 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
768 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
769 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
770 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
771 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
772 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
773 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
774 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
775 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
776 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
777 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
778 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
779 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
780 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
781 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
782 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
783 };
784 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
785 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
786 DstTy.getSimpleVT(),
787 SrcTy.getSimpleVT()))
788 return AdjustCost(Entry->Cost);
789 }
790
791 // Scalar integer to float conversions.
792 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
793 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
794 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
795 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
796 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
797 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
798 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
799 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
800 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
801 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
802 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
803 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
804 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
805 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
806 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
807 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
808 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
809 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
810 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
811 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
812 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
813 };
814
815 if (SrcTy.isInteger() && ST->hasNEON()) {
816 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
817 ISD, DstTy.getSimpleVT(),
818 SrcTy.getSimpleVT()))
819 return AdjustCost(Entry->Cost);
820 }
821
822 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
823 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
824 // are linearised so take more.
825 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
826 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
827 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
828 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
829 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
830 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
831 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
832 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
833 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
834 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
835 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
836 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
837 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
838 };
839
840 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
841 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
842 ISD, DstTy.getSimpleVT(),
843 SrcTy.getSimpleVT()))
844 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
845 }
846
847 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
848 // As general rule, fp converts that were not matched above are scalarized
849 // and cost 1 vcvt for each lane, so long as the instruction is available.
850 // If not it will become a series of function calls.
851 const InstructionCost CallCost =
852 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
853 int Lanes = 1;
854 if (SrcTy.isFixedLengthVector())
855 Lanes = SrcTy.getVectorNumElements();
856
857 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
858 return Lanes;
859 else
860 return Lanes * CallCost;
861 }
862
863 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
864 SrcTy.isFixedLengthVector()) {
865 // Treat a truncate with larger than legal source (128bits for MVE) as
866 // expensive, 2 instructions per lane.
867 if ((SrcTy.getScalarType() == MVT::i8 ||
868 SrcTy.getScalarType() == MVT::i16 ||
869 SrcTy.getScalarType() == MVT::i32) &&
870 SrcTy.getSizeInBits() > 128 &&
871 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
872 return SrcTy.getVectorNumElements() * 2;
873 }
874
875 // Scalar integer conversion costs.
876 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
877 // i16 -> i64 requires two dependent operations.
878 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
879
880 // Truncates on i64 are assumed to be free.
881 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
882 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
883 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
884 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
885 };
886
887 if (SrcTy.isInteger()) {
888 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
889 DstTy.getSimpleVT(),
890 SrcTy.getSimpleVT()))
891 return AdjustCost(Entry->Cost);
892 }
893
894 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
895 ? ST->getMVEVectorCostFactor(CostKind)
896 : 1;
897 return AdjustCost(
898 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
899}
900
901InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
902 TTI::TargetCostKind CostKind,
903 unsigned Index, Value *Op0,
904 Value *Op1) {
905 // Penalize inserting into a D-subregister. We end up with a three times
906 // lower estimated throughput on Swift.
907 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
908 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
909 return 3;
910
911 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
912 Opcode == Instruction::ExtractElement)) {
913 // Cross-class copies are expensive on many microarchitectures,
914 // so assume they are expensive by default.
915 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
916 return 3;
917
918 // Even if it's not a cross class copy, this likely leads to mixing
919 // of NEON and VFP code and should be therefore penalized.
920 if (ValTy->isVectorTy() &&
921 ValTy->getScalarSizeInBits() <= 32)
922 return std::max<InstructionCost>(
923 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
924 2U);
925 }
926
927 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
928 Opcode == Instruction::ExtractElement)) {
929 // Integer cross-lane moves are more expensive than float, which can
930 // sometimes just be vmovs. Integers involve being passed to GPR registers,
931 // causing more of a delay.
932 std::pair<InstructionCost, MVT> LT =
934 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
935 }
936
937 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
938}
939
940InstructionCost ARMTTIImpl::getCmpSelInstrCost(
941 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
942 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
943 TTI::OperandValueInfo Op2Info, const Instruction *I) {
944 int ISD = TLI->InstructionOpcodeToISD(Opcode);
945
946 // Thumb scalar code size cost for select.
947 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
948 ST->isThumb() && !ValTy->isVectorTy()) {
949 // Assume expensive structs.
950 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
951 return TTI::TCC_Expensive;
952
953 // Select costs can vary because they:
954 // - may require one or more conditional mov (including an IT),
955 // - can't operate directly on immediates,
956 // - require live flags, which we can't copy around easily.
957 InstructionCost Cost = TTI::TCC_Basic;
958
959 // Possible IT instruction for Thumb2, or more for Thumb1.
960 ++Cost;
961
962 // i1 values may need rematerialising by using mov immediates and/or
963 // flag setting instructions.
964 if (ValTy->isIntegerTy(1))
965 ++Cost;
966
967 return Cost;
968 }
969
970 // If this is a vector min/max/abs, use the cost of that intrinsic directly
971 // instead. Hopefully when min/max intrinsics are more prevalent this code
972 // will not be needed.
973 const Instruction *Sel = I;
974 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
975 Sel->hasOneUse())
976 Sel = cast<Instruction>(Sel->user_back());
977 if (Sel && ValTy->isVectorTy() &&
978 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
979 const Value *LHS, *RHS;
980 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
981 unsigned IID = 0;
982 switch (SPF) {
983 case SPF_ABS:
984 IID = Intrinsic::abs;
985 break;
986 case SPF_SMIN:
987 IID = Intrinsic::smin;
988 break;
989 case SPF_SMAX:
990 IID = Intrinsic::smax;
991 break;
992 case SPF_UMIN:
993 IID = Intrinsic::umin;
994 break;
995 case SPF_UMAX:
996 IID = Intrinsic::umax;
997 break;
998 case SPF_FMINNUM:
999 IID = Intrinsic::minnum;
1000 break;
1001 case SPF_FMAXNUM:
1002 IID = Intrinsic::maxnum;
1003 break;
1004 default:
1005 break;
1006 }
1007 if (IID) {
1008 // The ICmp is free, the select gets the cost of the min/max/etc
1009 if (Sel != I)
1010 return 0;
1011 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1012 return getIntrinsicInstrCost(CostAttrs, CostKind);
1013 }
1014 }
1015
1016 // On NEON a vector select gets lowered to vbsl.
1017 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1018 // Lowering of some vector selects is currently far from perfect.
1019 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1020 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1021 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1022 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1023 };
1024
1025 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1026 EVT SelValTy = TLI->getValueType(DL, ValTy);
1027 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1028 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1029 SelCondTy.getSimpleVT(),
1030 SelValTy.getSimpleVT()))
1031 return Entry->Cost;
1032 }
1033
1034 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1035 return LT.first;
1036 }
1037
1038 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1039 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1040 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1041 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1042 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1043 if (!VecCondTy)
1044 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1045
1046 // If we don't have mve.fp any fp operations will need to be scalarized.
1047 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1048 // One scalarization insert, one scalarization extract and the cost of the
1049 // fcmps.
1050 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1051 /*Extract*/ true, CostKind) +
1052 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1053 /*Extract*/ false, CostKind) +
1054 VecValTy->getNumElements() *
1055 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1056 VecCondTy->getScalarType(), VecPred,
1057 CostKind, Op1Info, Op2Info, I);
1058 }
1059
1060 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1061 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1062 // There are two types - the input that specifies the type of the compare
1063 // and the output vXi1 type. Because we don't know how the output will be
1064 // split, we may need an expensive shuffle to get two in sync. This has the
1065 // effect of making larger than legal compares (v8i32 for example)
1066 // expensive.
1067 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1068 if (LT.first > 1)
1069 return LT.first * BaseCost +
1070 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1071 /*Extract*/ false, CostKind);
1072 return BaseCost;
1073 }
1074 }
1075
1076 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1077 // for "multiple beats" potentially needed by MVE instructions.
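 // On smaller MVE implementations a 128-bit vector op executes over several
 // beats, which is what getMVEVectorCostFactor accounts for.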
1078 int BaseCost = 1;
1079 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1080 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1081
1082 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1083 CostKind, Op1Info, Op2Info, I);
1084}
1085
1086InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1087 ScalarEvolution *SE,
1088 const SCEV *Ptr) {
1089 // Address computations in vectorized code with non-consecutive addresses will
1090 // likely result in more instructions compared to scalar code where the
1091 // computation can more often be merged into the index mode. The resulting
1092 // extra micro-ops can significantly decrease throughput.
1093 unsigned NumVectorInstToHideOverhead = 10;
1094 int MaxMergeDistance = 64;
1095
1096 if (ST->hasNEON()) {
1097 if (Ty->isVectorTy() && SE &&
1098 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1099 return NumVectorInstToHideOverhead;
1100
1101 // In many cases the address computation is not merged into the instruction
1102 // addressing mode.
1103 return 1;
1104 }
1105 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1106}
1107
1108bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1109 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1110 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1111 // optimized, else LSR may block tail-predication.
1112 switch (II->getIntrinsicID()) {
1113 case Intrinsic::arm_mve_vctp8:
1114 case Intrinsic::arm_mve_vctp16:
1115 case Intrinsic::arm_mve_vctp32:
1116 case Intrinsic::arm_mve_vctp64:
1117 return true;
1118 default:
1119 break;
1120 }
1121 }
1122 return false;
1123}
1124
1125bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1126 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1127 return false;
1128
1129 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1130 // Don't support v2i1 yet.
1131 if (VecTy->getNumElements() == 2)
1132 return false;
1133
1134 // We don't support extending fp types.
1135 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1136 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1137 return false;
1138 }
1139
1140 unsigned EltWidth = DataTy->getScalarSizeInBits();
1141 return (EltWidth == 32 && Alignment >= 4) ||
1142 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1143}
1144
1145bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1146 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1147 return false;
1148
1149 unsigned EltWidth = Ty->getScalarSizeInBits();
1150 return ((EltWidth == 32 && Alignment >= 4) ||
1151 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1152}
1153
1154/// Given a memcpy/memset/memmove instruction, return the number of memory
1155/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1156/// call is used.
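/// For example, if a 16-byte memcpy is lowered as four i32 chunks, this returns
/// 4 * 2 = 8, counting a load and a store per chunk.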
1157int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1158 MemOp MOp;
1159 unsigned DstAddrSpace = ~0u;
1160 unsigned SrcAddrSpace = ~0u;
1161 const Function *F = I->getParent()->getParent();
1162
1163 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1164 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1165 // If 'size' is not a constant, a library call will be generated.
1166 if (!C)
1167 return -1;
1168
1169 const unsigned Size = C->getValue().getZExtValue();
1170 const Align DstAlign = *MC->getDestAlign();
1171 const Align SrcAlign = *MC->getSourceAlign();
1172
1173 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1174 /*IsVolatile*/ false);
1175 DstAddrSpace = MC->getDestAddressSpace();
1176 SrcAddrSpace = MC->getSourceAddressSpace();
1177 }
1178 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1179 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1180 // If 'size' is not a constant, a library call will be generated.
1181 if (!C)
1182 return -1;
1183
1184 const unsigned Size = C->getValue().getZExtValue();
1185 const Align DstAlign = *MS->getDestAlign();
1186
1187 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1188 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1189 DstAddrSpace = MS->getDestAddressSpace();
1190 }
1191 else
1192 llvm_unreachable("Expected a memcpy/move or memset!");
1193
1194 unsigned Limit, Factor = 2;
1195 switch(I->getIntrinsicID()) {
1196 case Intrinsic::memcpy:
1197 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1198 break;
1199 case Intrinsic::memmove:
1200 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1201 break;
1202 case Intrinsic::memset:
1203 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1204 Factor = 1;
1205 break;
1206 default:
1207 llvm_unreachable("Expected a memcpy/move or memset!");
1208 }
1209
1210 // MemOps will be populated with a list of data types that need to be
1211 // loaded and stored. That's why we multiply the number of elements by 2 to
1212 // get the cost for this memcpy.
1213 std::vector<EVT> MemOps;
1214 if (getTLI()->findOptimalMemOpLowering(
1215 MemOps, Limit, MOp, DstAddrSpace,
1216 SrcAddrSpace, F->getAttributes()))
1217 return MemOps.size() * Factor;
1218
1219 // If we can't find an optimal memop lowering, return the default cost
1220 return -1;
1221}
1222
1223InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1224 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1225
1226 // To model the cost of a library call, we assume 1 for the call, and
1227 // 3 for the argument setup.
1228 if (NumOps == -1)
1229 return 4;
1230 return NumOps;
1231}
1232
1233InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1234 VectorType *Tp, ArrayRef<int> Mask,
1235 TTI::TargetCostKind CostKind,
1236 int Index, VectorType *SubTp,
1237 ArrayRef<const Value *> Args,
1238 const Instruction *CxtI) {
1239 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
1240 // Treat extractsubvector as single op permutation.
1241 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1242 if (IsExtractSubvector)
1243 Kind = TTI::SK_PermuteSingleSrc;
1244 if (ST->hasNEON()) {
1245 if (Kind == TTI::SK_Broadcast) {
1246 static const CostTblEntry NEONDupTbl[] = {
1247 // VDUP handles these cases.
1248 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1249 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1250 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1251 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1252 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1253 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1254
1255 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1256 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1257 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1258 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1259
1260 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1261 if (const auto *Entry =
1262 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1263 return LT.first * Entry->Cost;
1264 }
1265 if (Kind == TTI::SK_Reverse) {
1266 static const CostTblEntry NEONShuffleTbl[] = {
1267 // Reverse shuffle cost one instruction if we are shuffling within a
1268 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1269 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1270 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1271 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1272 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1273 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1274 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1275
1276 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1277 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1278 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1279 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1280
1281 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1282 if (const auto *Entry =
1283 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1284 return LT.first * Entry->Cost;
1285 }
1286 if (Kind == TTI::SK_Select) {
1287 static const CostTblEntry NEONSelShuffleTbl[] = {
1288 // Select shuffle cost table for ARM. Cost is the number of
1289 // instructions
1290 // required to create the shuffled vector.
1291
1292 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1293 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1294 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1295 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1296
1297 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1298 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1299 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1300
1301 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1302
1303 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1304
1305 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1306 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1307 ISD::VECTOR_SHUFFLE, LT.second))
1308 return LT.first * Entry->Cost;
1309 }
1310 }
1311 if (ST->hasMVEIntegerOps()) {
1312 if (Kind == TTI::SK_Broadcast) {
1313 static const CostTblEntry MVEDupTbl[] = {
1314 // VDUP handles these cases.
1315 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1316 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1317 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1318 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1319 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1320
1321 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1322 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1323 LT.second))
1324 return LT.first * Entry->Cost *
1325 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1326 }
1327
1328 if (!Mask.empty()) {
1329 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1330 if (LT.second.isVector() &&
1331 Mask.size() <= LT.second.getVectorNumElements() &&
1332 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1333 isVREVMask(Mask, LT.second, 64)))
1334 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1335 }
1336 }
1337
1338 // Restore optimal kind.
1339 if (IsExtractSubvector)
1340 Kind = TTI::SK_ExtractSubvector;
1341 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1342 ? ST->getMVEVectorCostFactor(CostKind)
1343 : 1;
1344 return BaseCost *
1345 BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1346}
1347
1348InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1349 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1350 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1351 ArrayRef<const Value *> Args,
1352 const Instruction *CxtI) {
1353 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1354 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1355 // Make operations on i1 relatively expensive as this often involves
1356 // combining predicates. AND and XOR should be easier to handle with IT
1357 // blocks.
1358 switch (ISDOpcode) {
1359 default:
1360 break;
1361 case ISD::AND:
1362 case ISD::XOR:
1363 return 2;
1364 case ISD::OR:
1365 return 3;
1366 }
1367 }
1368
1369 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1370
1371 if (ST->hasNEON()) {
1372 const unsigned FunctionCallDivCost = 20;
1373 const unsigned ReciprocalDivCost = 10;
1374 static const CostTblEntry CostTbl[] = {
1375 // Division.
1376 // These costs are somewhat random. Choose a cost of 20 to indicate that
1377 // vectorizing division (added function call) is going to be very expensive.
1378 // Double registers types.
1379 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1380 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1381 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1382 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1383 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1384 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1385 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1386 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1387 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1388 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1389 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1390 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1391 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1392 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1393 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1394 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1395 // Quad register types.
1396 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1397 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1398 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1399 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1400 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1401 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1402 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1403 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1404 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1405 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1406 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1407 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1408 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1409 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1410 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1411 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1412 // Multiplication.
1413 };
1414
1415 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1416 return LT.first * Entry->Cost;
1417
1418 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1419 Opcode, Ty, CostKind, Op1Info, Op2Info);
1420
1421 // This is somewhat of a hack. The problem that we are facing is that SROA
1422 // creates a sequence of shift, and, or instructions to construct values.
1423 // These sequences are recognized by the ISel and have zero-cost. Not so for
1424 // the vectorized code. Because we have support for v2i64 but not i64 those
1425 // sequences look particularly beneficial to vectorize.
1426 // To work around this we increase the cost of v2i64 operations to make them
1427 // seem less beneficial.
1428 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1429 Cost += 4;
1430
1431 return Cost;
1432 }
1433
1434 // If this operation is a shift on arm/thumb2, it might well be folded into
1435 // the following instruction, hence having a cost of 0.
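 // For example, "add r0, r1, r2, lsl #2" folds the shift into the add's
 // flexible second operand.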
1436 auto LooksLikeAFreeShift = [&]() {
1437 if (ST->isThumb1Only() || Ty->isVectorTy())
1438 return false;
1439
1440 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1441 return false;
1442 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1443 return false;
1444
1445 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1446 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1447 case Instruction::Add:
1448 case Instruction::Sub:
1449 case Instruction::And:
1450 case Instruction::Xor:
1451 case Instruction::Or:
1452 case Instruction::ICmp:
1453 return true;
1454 default:
1455 return false;
1456 }
1457 };
1458 if (LooksLikeAFreeShift())
1459 return 0;
1460
1461 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1462 // for "multiple beats" potentially needed by MVE instructions.
1463 int BaseCost = 1;
1464 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1465 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1466
1467 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1468 // without treating floats as more expensive than scalars or increasing the
1469 // costs for custom operations. The results are also multiplied by the
1470 // MVEVectorCostFactor where appropriate.
1471 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1472 return LT.first * BaseCost;
1473
1474 // Else this is expand, assume that we need to scalarize this op.
1475 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1476 unsigned Num = VTy->getNumElements();
1477 InstructionCost Cost =
1478 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1479 // Return the cost of multiple scalar invocation plus the cost of
1480 // inserting and extracting the values.
1481 SmallVector<Type *> Tys(Args.size(), Ty);
1482 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1483 Num * Cost;
1484 }
1485
1486 return BaseCost;
1487}
1488
1489InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1490 MaybeAlign Alignment,
1491 unsigned AddressSpace,
1492 TTI::TargetCostKind CostKind,
1493 TTI::OperandValueInfo OpInfo,
1494 const Instruction *I) {
1495 // TODO: Handle other cost kinds.
1496 if (CostKind != TTI::TCK_RecipThroughput)
1497 return 1;
1498
1499 // Type legalization can't handle structs
1500 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1501 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1502 CostKind);
1503
1504 if (ST->hasNEON() && Src->isVectorTy() &&
1505 (Alignment && *Alignment != Align(16)) &&
1506 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1507 // Unaligned loads/stores are extremely inefficient.
1508 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1509 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1510 return LT.first * 4;
1511 }
1512
1513 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1514 // Same for stores.
1515 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1516 ((Opcode == Instruction::Load && I->hasOneUse() &&
1517 isa<FPExtInst>(*I->user_begin())) ||
1518 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1519 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1520 Type *DstTy =
1521 Opcode == Instruction::Load
1522 ? (*I->user_begin())->getType()
1523 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1524 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1525 DstTy->getScalarType()->isFloatTy())
1526 return ST->getMVEVectorCostFactor(CostKind);
1527 }
1528
1529 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1530 ? ST->getMVEVectorCostFactor(CostKind)
1531 : 1;
1532 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1533 CostKind, OpInfo, I);
1534}
1535
1536InstructionCost
1537ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1538 unsigned AddressSpace,
1539 TTI::TargetCostKind CostKind) {
1540 if (ST->hasMVEIntegerOps()) {
1541 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1542 return ST->getMVEVectorCostFactor(CostKind);
1543 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1544 return ST->getMVEVectorCostFactor(CostKind);
1545 }
1546 if (!isa<FixedVectorType>(Src))
1547 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1548 CostKind);
1549 // Scalar cost, which is currently very high due to the efficiency of the
1550 // generated code.
1551 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1552}
1553
1554InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1555 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1556 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1557 bool UseMaskForCond, bool UseMaskForGaps) {
1558 assert(Factor >= 2 && "Invalid interleave factor");
1559 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1560
1561 // vldN/vstN doesn't support vector types of i64/f64 element.
1562 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1563
1564 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1565 !UseMaskForCond && !UseMaskForGaps) {
1566 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1567 auto *SubVecTy =
1568 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1569
1570 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1571 // Accesses having vector types that are a multiple of 128 bits can be
1572 // matched to more than one vldN/vstN instruction.
1573 int BaseCost =
1574 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1575 if (NumElts % Factor == 0 &&
1576 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1577 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1578
1579 // Some smaller than legal interleaved patterns are cheap as we can make
1580 // use of the vmovn or vrev patterns to interleave a standard load. This is
1581 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1582 // promoted differently). The cost of 2 here is then a load and vrev or
1583 // vmovn.
1584 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1585 VecTy->isIntOrIntVectorTy() &&
1586 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1587 return 2 * BaseCost;
1588 }
1589
1590 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1591 Alignment, AddressSpace, CostKind,
1592 UseMaskForCond, UseMaskForGaps);
1593}
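// Example (sketch): on a NEON target a factor-2 interleaved load of <8 x i16>
// uses <4 x i16> sub-vectors which, if accepted by
// isLegalInterleavedAccessType, map to a single vld2: 2 * BaseCost(1) * 1 = 2.
// With MVE, a factor-2 pattern such as <8 x i8>, whose sub-vector is 64 bits
// or smaller, is costed as 2 * getMVEVectorCostFactor (a load plus a
// vrev/vmovn-style shuffle), per the comment above.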
1594
1595InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1596    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1597 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1598 using namespace PatternMatch;
1599 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1600 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1601 Alignment, CostKind, I);
1602
1603 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1604 auto *VTy = cast<FixedVectorType>(DataTy);
1605
1606 // TODO: Splitting, once we do that.
1607
1608 unsigned NumElems = VTy->getNumElements();
1609 unsigned EltSize = VTy->getScalarSizeInBits();
1610 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1611
1612 // For now, it is assumed that for the MVE gather instructions the loads are
1613 // all effectively serialised. This means the cost is the scalar cost
1614 // multiplied by the number of elements being loaded. This is possibly very
1615 // conservative, but even so we still end up vectorising loops because the
1616 // cost per iteration for many loops is lower than for scalar loops.
1617 InstructionCost VectorCost =
1618 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1619 // The scalarization cost should be a lot higher. We use the number of vector
1620 // elements plus the scalarization overhead. If masking is required then a lot
1621 // of little blocks will be needed and potentially a scalarized p0 mask,
1622 // greatly increasing the cost.
1623 InstructionCost ScalarCost =
1624 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1625 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1626 CostKind) +
1627 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1628 CostKind);
1629
1630 if (EltSize < 8 || Alignment < EltSize / 8)
1631 return ScalarCost;
1632
1633 unsigned ExtSize = EltSize;
1634 // Check whether there's a single user that asks for an extended type
1635 if (I != nullptr) {
1636    // Depending on the caller of this function, a gather instruction will
1637 // either have opcode Instruction::Load or be a call to the masked_gather
1638 // intrinsic
1639 if ((I->getOpcode() == Instruction::Load ||
1640 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1641 I->hasOneUse()) {
1642 const User *Us = *I->users().begin();
1643 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1644 // only allow valid type combinations
1645 unsigned TypeSize =
1646 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1647 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1648 (TypeSize == 16 && EltSize == 8)) &&
1649 TypeSize * NumElems == 128) {
1650 ExtSize = TypeSize;
1651 }
1652 }
1653 }
1654 // Check whether the input data needs to be truncated
1655 TruncInst *T;
1656 if ((I->getOpcode() == Instruction::Store ||
1657 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1658 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1659 // Only allow valid type combinations
1660 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1661 if (((EltSize == 16 && TypeSize == 32) ||
1662 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1663 TypeSize * NumElems == 128)
1664 ExtSize = TypeSize;
1665 }
1666 }
1667
1668 if (ExtSize * NumElems != 128 || NumElems < 4)
1669 return ScalarCost;
1670
1671 // Any (aligned) i32 gather will not need to be scalarised.
1672 if (ExtSize == 32)
1673 return VectorCost;
1674 // For smaller types, we need to ensure that the gep's inputs are correctly
1675 // extended from a small enough value. Other sizes (including i64) are
1676 // scalarized for now.
1677 if (ExtSize != 8 && ExtSize != 16)
1678 return ScalarCost;
1679
1680 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1681 Ptr = BC->getOperand(0);
1682 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1683 if (GEP->getNumOperands() != 2)
1684 return ScalarCost;
1685 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1686 // Scale needs to be correct (which is only relevant for i16s).
1687 if (Scale != 1 && Scale * 8 != ExtSize)
1688 return ScalarCost;
1689 // And we need to zext (not sext) the indexes from a small enough type.
1690 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1691 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1692 return VectorCost;
1693 }
1694 return ScalarCost;
1695 }
1696 return ScalarCost;
1697}
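// Example (sketch): an aligned gather of <4 x i32> has EltSize == ExtSize == 32
// and NumElems * ExtSize == 128, so it returns VectorCost =
// 4 * LT.first * getMVEVectorCostFactor. A gather whose alignment is below the
// element size, or whose (extended) element type cannot form a 128-bit vector,
// returns the much larger ScalarCost instead.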
1698
1699InstructionCost
1700ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1701                                       std::optional<FastMathFlags> FMF,
1702                                       TTI::TargetCostKind CostKind) {
1703
1704 EVT ValVT = TLI->getValueType(DL, ValTy);
1705 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1706 unsigned EltSize = ValVT.getScalarSizeInBits();
1707
1708 // In general floating point reductions are a series of elementwise
1709 // operations, with free extracts on each step. These are either in-order or
1710 // treewise depending on whether that is allowed by the fast math flags.
1711 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1712 ((EltSize == 32 && ST->hasVFP2Base()) ||
1713 (EltSize == 64 && ST->hasFP64()) ||
1714 (EltSize == 16 && ST->hasFullFP16()))) {
1715 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1716 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1717 InstructionCost VecCost = 0;
1718 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1719 NumElts * EltSize > VecLimit) {
1720 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1721 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1722 NumElts /= 2;
1723 }
1724
1725 // For fp16 we need to extract the upper lane elements. MVE can add a
1726 // VREV+FMIN/MAX to perform another vector step instead.
1727 InstructionCost ExtractCost = 0;
1728 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1729 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1730 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1731 NumElts /= 2;
1732 } else if (ValVT.getVectorElementType() == MVT::f16)
1733 ExtractCost = NumElts / 2;
1734
1735 return VecCost + ExtractCost +
1736           NumElts *
1737               getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1738  }
1739
1740 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1741 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1742 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1743 unsigned VecLimit =
1744 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1745 InstructionCost VecCost = 0;
1746 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1747 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1748 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1749 NumElts /= 2;
1750 }
1751 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1752 // step.
1753 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1754 NumElts * EltSize == 64) {
1755 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1756 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1757 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1758 NumElts /= 2;
1759 }
1760
1761 // From here we extract the elements and perform the and/or/xor.
1762 InstructionCost ExtractCost = NumElts;
1763 return VecCost + ExtractCost +
1764 (NumElts - 1) * getArithmeticInstrCost(
1765 Opcode, ValTy->getElementType(), CostKind);
1766 }
1767
1768  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1769      TTI::requiresOrderedReduction(FMF))
1770    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1771
1772 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1773
1774 static const CostTblEntry CostTblAdd[]{
1775 {ISD::ADD, MVT::v16i8, 1},
1776 {ISD::ADD, MVT::v8i16, 1},
1777 {ISD::ADD, MVT::v4i32, 1},
1778 };
1779 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1780 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1781
1782 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1783}
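// Example (sketch): an unordered vecreduce.fadd of <8 x float> on MVE is
// modelled as one <4 x float> fadd step (to reach the 128-bit limit) followed
// by four scalar fadds for the final reduction. A vecreduce.add of <4 x i32>
// matches the VADDV table above and costs 1 * getMVEVectorCostFactor * LT.first.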
1784
1785InstructionCost ARMTTIImpl::getExtendedReductionCost(
1786    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1787    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1788  EVT ValVT = TLI->getValueType(DL, ValTy);
1789 EVT ResVT = TLI->getValueType(DL, ResTy);
1790
1791 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1792
1793 switch (ISD) {
1794 case ISD::ADD:
1795 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1796 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1797
1798 // The legal cases are:
1799 // VADDV u/s 8/16/32
1800 // VADDLV u/s 32
1801 // Codegen currently cannot always handle larger than legal vectors very
1802 // well, especially for predicated reductions where the mask needs to be
1803 // split, so restrict to 128bit or smaller input types.
1804 unsigned RevVTSize = ResVT.getSizeInBits();
1805 if (ValVT.getSizeInBits() <= 128 &&
1806 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1807 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1808 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1809 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1810 }
1811 break;
1812 default:
1813 break;
1814 }
1815 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1816 CostKind);
1817}
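// Example (sketch): a zero-extending add reduction from <16 x i8> to i32 maps
// to VADDV.u8 and is costed at getMVEVectorCostFactor * LT.first, while the
// same input widened to an i64 result (RevVTSize 64 > 32) falls back to the
// base implementation.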
1818
1819InstructionCost
1820ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1821                                   VectorType *ValTy,
1822                                   TTI::TargetCostKind CostKind) {
1823  EVT ValVT = TLI->getValueType(DL, ValTy);
1824 EVT ResVT = TLI->getValueType(DL, ResTy);
1825
1826 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1827 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1828
1829 // The legal cases are:
1830 // VMLAV u/s 8/16/32
1831 // VMLALV u/s 16/32
1832 // Codegen currently cannot always handle larger than legal vectors very
1833 // well, especially for predicated reductions where the mask needs to be
1834 // split, so restrict to 128bit or smaller input types.
1835 unsigned RevVTSize = ResVT.getSizeInBits();
1836 if (ValVT.getSizeInBits() <= 128 &&
1837 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1838 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1839 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1840 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1841 }
1842
1843 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1844}
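// Example (sketch): a multiply-accumulate reduction of two <8 x i16> inputs
// into an i32 (VMLAV.s16) or i64 (VMLALV.s16) accumulator is costed at
// getMVEVectorCostFactor * LT.first; wider-than-128-bit inputs fall back to
// the base implementation.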
1845
1846InstructionCost
1847ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1848                                   FastMathFlags FMF,
1849                                   TTI::TargetCostKind CostKind) {
1850  EVT ValVT = TLI->getValueType(DL, Ty);
1851
1852 // In general floating point reductions are a series of elementwise
1853 // operations, with free extracts on each step. These are either in-order or
1854 // treewise depending on whether that is allowed by the fast math flags.
1855 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1856 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1857 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1858 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1859 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1860 unsigned EltSize = ValVT.getScalarSizeInBits();
1861 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1862 InstructionCost VecCost;
1863 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1864 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1865 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1866 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1867 NumElts /= 2;
1868 }
1869
1870 // For fp16 we need to extract the upper lane elements. MVE can add a
1871 // VREV+FMIN/MAX to perform another vector step instead.
1872 InstructionCost ExtractCost = 0;
1873 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1874 NumElts == 8) {
1875 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1876 NumElts /= 2;
1877 } else if (ValVT.getVectorElementType() == MVT::f16)
1878 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
1879
1881 {Ty->getElementType(), Ty->getElementType()},
1882 FMF);
1883 return VecCost + ExtractCost +
1884 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1885 }
1886
1887 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1888 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1889 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1890
1891 // All costs are the same for u/s min/max. These lower to vminv, which are
1892 // given a slightly higher cost as they tend to take multiple cycles for
1893 // smaller type sizes.
1894 static const CostTblEntry CostTblAdd[]{
1895 {ISD::SMIN, MVT::v16i8, 4},
1896 {ISD::SMIN, MVT::v8i16, 3},
1897 {ISD::SMIN, MVT::v4i32, 2},
1898 };
1899 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
1900 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1901 }
1902
1903 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1904}
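// Example (sketch): a smin/umin reduction of <8 x i16> on MVE hits the
// 3-instruction table entry above, giving 3 * getMVEVectorCostFactor * LT.first.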
1905
1906InstructionCost
1907ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1908                                  TTI::TargetCostKind CostKind) {
1909  unsigned Opc = ICA.getID();
1910 switch (Opc) {
1911 case Intrinsic::get_active_lane_mask:
1912 // Currently we make a somewhat optimistic assumption that
1913 // active_lane_mask's are always free. In reality it may be freely folded
1914 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1915 // of add/icmp code. We may need to improve this in the future, but being
1916 // able to detect if it is free or not involves looking at a lot of other
1917 // code. We currently assume that the vectorizer inserted these, and knew
1918 // what it was doing in adding one.
1919 if (ST->hasMVEIntegerOps())
1920 return 0;
1921 break;
1922 case Intrinsic::sadd_sat:
1923 case Intrinsic::ssub_sat:
1924 case Intrinsic::uadd_sat:
1925 case Intrinsic::usub_sat: {
1926 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1927 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1928 Type *RetTy = ICA.getReturnType();
1929
1930 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1931 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
1932 return 1; // qadd / qsub
1933 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
1934 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
1935 // Otherwise return the cost of expanding the node. Generally an add +
1936      // icmp + sel.
1937      CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1938      Type *CondTy = RetTy->getWithNewBitWidth(1);
1939 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
1940 RetTy, CostKind) +
1941 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
1942 CostKind) +
1943 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
1944 CostKind);
1945 }
1946
1947 if (!ST->hasMVEIntegerOps())
1948 break;
1949
1950 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
1951 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1952 LT.second == MVT::v16i8) {
1953 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1954 // need to extend the type, as it uses shr(qadd(shl, shl)).
1955 unsigned Instrs =
1956 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
1957 : 4;
1958 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1959 }
1960 break;
1961 }
1962 case Intrinsic::abs:
1963 case Intrinsic::smin:
1964 case Intrinsic::smax:
1965 case Intrinsic::umin:
1966 case Intrinsic::umax: {
1967 if (!ST->hasMVEIntegerOps())
1968 break;
1969 Type *VT = ICA.getReturnType();
1970
1971 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1972 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1973 LT.second == MVT::v16i8)
1974 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1975 break;
1976 }
1977 case Intrinsic::minnum:
1978 case Intrinsic::maxnum: {
1979 if (!ST->hasMVEFloatOps())
1980 break;
1981 Type *VT = ICA.getReturnType();
1982 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1983 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1984 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1985 break;
1986 }
1987 case Intrinsic::fptosi_sat:
1988 case Intrinsic::fptoui_sat: {
1989 if (ICA.getArgTypes().empty())
1990 break;
1991 bool IsSigned = Opc == Intrinsic::fptosi_sat;
1992 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1993 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1994    // Check for the legal types, with the correct subtarget features.
1995 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1996 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1997 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1998 return LT.first;
1999
2000 // Equally for MVE vector types
2001 if (ST->hasMVEFloatOps() &&
2002 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2003 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2004 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2005
2006 // If we can we use a legal convert followed by a min+max
2007 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2008 (ST->hasFP64() && LT.second == MVT::f64) ||
2009 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2010 (ST->hasMVEFloatOps() &&
2011 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2012 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2013 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2014                                    LT.second.getScalarSizeInBits());
2015    InstructionCost Cost =
2016        LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2017 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2018 : Intrinsic::umin,
2019                                   LegalTy, {LegalTy, LegalTy});
2020    Cost += getIntrinsicInstrCost(Attrs1, CostKind);
2021 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2022 : Intrinsic::umax,
2023                                   LegalTy, {LegalTy, LegalTy});
2024    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2025 return LT.first * Cost;
2026 }
2027 // Otherwise we need to follow the default expansion that clamps the value
2028 // using a float min/max with a fcmp+sel for nan handling when signed.
2029 Type *FPTy = ICA.getArgTypes()[0];
2030 Type *RetTy = ICA.getReturnType();
2031    IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2032    InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
2033    IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2034    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2035    Cost +=
2036        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2037                         RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2038    if (IsSigned) {
2039 Type *CondTy = RetTy->getWithNewBitWidth(1);
2040      Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2041                                 CmpInst::FCMP_UNO, CostKind);
2042      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2043                                 CmpInst::FCMP_UNO, CostKind);
2044    }
2045 return Cost;
2046 }
2047 }
2048
2049  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2050}
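// Example (sketch): a scalar i32 sadd.sat with the DSP extension costs 1
// (a single QADD); a <4 x i32> uadd.sat with MVE is one VQADD, i.e.
// getMVEVectorCostFactor; an fptosi.sat from f32 to i32 with VFP2 costs
// LT.first, since the conversion instruction already saturates.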
2051
2052bool ARMTTIImpl::isLoweredToCall(const Function *F) {
2053  if (!F->isIntrinsic())
2054 return BaseT::isLoweredToCall(F);
2055
2056 // Assume all Arm-specific intrinsics map to an instruction.
2057 if (F->getName().starts_with("llvm.arm"))
2058 return false;
2059
2060 switch (F->getIntrinsicID()) {
2061 default: break;
2062 case Intrinsic::powi:
2063 case Intrinsic::sin:
2064 case Intrinsic::cos:
2065 case Intrinsic::sincos:
2066 case Intrinsic::pow:
2067 case Intrinsic::log:
2068 case Intrinsic::log10:
2069 case Intrinsic::log2:
2070 case Intrinsic::exp:
2071 case Intrinsic::exp2:
2072 return true;
2073 case Intrinsic::sqrt:
2074 case Intrinsic::fabs:
2075 case Intrinsic::copysign:
2076 case Intrinsic::floor:
2077 case Intrinsic::ceil:
2078 case Intrinsic::trunc:
2079 case Intrinsic::rint:
2080 case Intrinsic::nearbyint:
2081 case Intrinsic::round:
2082 case Intrinsic::canonicalize:
2083 case Intrinsic::lround:
2084 case Intrinsic::llround:
2085 case Intrinsic::lrint:
2086 case Intrinsic::llrint:
2087 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2088 return true;
2089 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2090 return true;
2091 // Some operations can be handled by vector instructions and assume
2092 // unsupported vectors will be expanded into supported scalar ones.
2093 // TODO Handle scalar operations properly.
2094 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2095 case Intrinsic::masked_store:
2096 case Intrinsic::masked_load:
2097 case Intrinsic::masked_gather:
2098 case Intrinsic::masked_scatter:
2099 return !ST->hasMVEIntegerOps();
2100 case Intrinsic::sadd_with_overflow:
2101 case Intrinsic::uadd_with_overflow:
2102 case Intrinsic::ssub_with_overflow:
2103 case Intrinsic::usub_with_overflow:
2104 case Intrinsic::sadd_sat:
2105 case Intrinsic::uadd_sat:
2106 case Intrinsic::ssub_sat:
2107 case Intrinsic::usub_sat:
2108 return false;
2109 }
2110
2111 return BaseT::isLoweredToCall(F);
2112}
2113
2114bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
2115  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2116 EVT VT = TLI->getValueType(DL, I.getType(), true);
2117 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2118 return true;
2119
2120 // Check if an intrinsic will be lowered to a call and assume that any
2121 // other CallInst will generate a bl.
2122 if (auto *Call = dyn_cast<CallInst>(&I)) {
2123 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2124 switch(II->getIntrinsicID()) {
2125 case Intrinsic::memcpy:
2126 case Intrinsic::memset:
2127 case Intrinsic::memmove:
2128 return getNumMemOps(II) == -1;
2129 default:
2130 if (const Function *F = Call->getCalledFunction())
2131 return isLoweredToCall(F);
2132 }
2133 }
2134 return true;
2135 }
2136
2137 // FPv5 provides conversions between integer, double-precision,
2138 // single-precision, and half-precision formats.
2139 switch (I.getOpcode()) {
2140 default:
2141 break;
2142 case Instruction::FPToSI:
2143 case Instruction::FPToUI:
2144 case Instruction::SIToFP:
2145 case Instruction::UIToFP:
2146 case Instruction::FPTrunc:
2147 case Instruction::FPExt:
2148 return !ST->hasFPARMv8Base();
2149 }
2150
2151 // FIXME: Unfortunately the approach of checking the Operation Action does
2152 // not catch all cases of Legalization that use library calls. Our
2153 // Legalization step categorizes some transformations into library calls as
2154 // Custom, Expand or even Legal when doing type legalization. So for now
2155 // we have to special case for instance the SDIV of 64bit integers and the
2156 // use of floating point emulation.
2157 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2158 switch (ISD) {
2159 default:
2160 break;
2161 case ISD::SDIV:
2162 case ISD::UDIV:
2163 case ISD::SREM:
2164 case ISD::UREM:
2165 case ISD::SDIVREM:
2166 case ISD::UDIVREM:
2167 return true;
2168 }
2169 }
2170
2171 // Assume all other non-float operations are supported.
2172 if (!VT.isFloatingPoint())
2173 return false;
2174
2175 // We'll need a library call to handle most floats when using soft.
2176 if (TLI->useSoftFloat()) {
2177 switch (I.getOpcode()) {
2178 default:
2179 return true;
2180 case Instruction::Alloca:
2181 case Instruction::Load:
2182 case Instruction::Store:
2183 case Instruction::Select:
2184 case Instruction::PHI:
2185 return false;
2186 }
2187 }
2188
2189 // We'll need a libcall to perform double precision operations on a single
2190 // precision only FPU.
2191 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2192 return true;
2193
2194 // Likewise for half precision arithmetic.
2195 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2196 return true;
2197
2198 return false;
2199}
2200
2201bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2202                                          AssumptionCache &AC,
2203 TargetLibraryInfo *LibInfo,
2204 HardwareLoopInfo &HWLoopInfo) {
2205 // Low-overhead branches are only supported in the 'low-overhead branch'
2206 // extension of v8.1-m.
2207 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2208 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2209 return false;
2210 }
2211
2212  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2213    LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2214 return false;
2215 }
2216
2217 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2218 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2219 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2220 return false;
2221 }
2222
2223 const SCEV *TripCountSCEV =
2224 SE.getAddExpr(BackedgeTakenCount,
2225 SE.getOne(BackedgeTakenCount->getType()));
2226
2227 // We need to store the trip count in LR, a 32-bit register.
2228 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2229 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2230 return false;
2231 }
2232
2233 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2234 // point in generating a hardware loop if that's going to happen.
2235
2236 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2237 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2238 switch (Call->getIntrinsicID()) {
2239 default:
2240 break;
2241 case Intrinsic::start_loop_iterations:
2242 case Intrinsic::test_start_loop_iterations:
2243 case Intrinsic::loop_decrement:
2244 case Intrinsic::loop_decrement_reg:
2245 return true;
2246 }
2247 }
2248 return false;
2249 };
2250
2251 // Scan the instructions to see if there's any that we know will turn into a
2252 // call or if this loop is already a low-overhead loop or will become a tail
2253 // predicated loop.
2254 bool IsTailPredLoop = false;
2255 auto ScanLoop = [&](Loop *L) {
2256 for (auto *BB : L->getBlocks()) {
2257 for (auto &I : *BB) {
2258 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2259 isa<InlineAsm>(I)) {
2260 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2261 return false;
2262 }
2263 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2264 IsTailPredLoop |=
2265 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2266 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2267 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2268 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2269 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2270 }
2271 }
2272 return true;
2273 };
2274
2275 // Visit inner loops.
2276 for (auto *Inner : *L)
2277 if (!ScanLoop(Inner))
2278 return false;
2279
2280 if (!ScanLoop(L))
2281 return false;
2282
2283 // TODO: Check whether the trip count calculation is expensive. If L is the
2284 // inner loop but we know it has a low trip count, calculating that trip
2285 // count (in the parent loop) may be detrimental.
2286
2287 LLVMContext &C = L->getHeader()->getContext();
2288 HWLoopInfo.CounterInReg = true;
2289 HWLoopInfo.IsNestingLegal = false;
2290 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2291 HWLoopInfo.CountType = Type::getInt32Ty(C);
2292 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2293 return true;
2294}
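// Example (sketch): a loop body containing a 64-bit sdiv (typically lowered to
// a runtime library call) or an inline-asm block is rejected by ScanLoop above,
// since the resulting call would clobber LR and defeat the low-overhead loop.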
2295
2296static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2297 // We don't allow icmp's, and because we only look at single block loops,
2298 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2299 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2300 return false;
2301 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2302 // not currently canonical, but soon will be. Code without them uses icmp, and
2303 // so is not tail predicated as per the condition above. In order to get the
2304 // same performance we treat min and max the same as an icmp for tailpred
2305 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2306 // pick more optimial instructions like VQDMULH. They need to be recognized
2307 // directly by the vectorizer).
2308 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2309 if ((II->getIntrinsicID() == Intrinsic::smin ||
2310 II->getIntrinsicID() == Intrinsic::smax ||
2311 II->getIntrinsicID() == Intrinsic::umin ||
2312 II->getIntrinsicID() == Intrinsic::umax) &&
2313 ++ICmpCount > 1)
2314 return false;
2315
2316 if (isa<FCmpInst>(&I))
2317 return false;
2318
2319 // We could allow extending/narrowing FP loads/stores, but codegen is
2320 // too inefficient so reject this for now.
2321 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2322 return false;
2323
2324 // Extends have to be extending-loads
2325 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2326 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2327 return false;
2328
2329 // Truncs have to be narrowing-stores
2330 if (isa<TruncInst>(&I) )
2331 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2332 return false;
2333
2334 return true;
2335}
2336
2337// To set up a tail-predicated loop, we need to know the total number of
2338// elements processed by that loop. Thus, we need to determine the element
2339// size and:
2340// 1) it should be uniform for all operations in the vector loop, so we
2341// e.g. don't want any widening/narrowing operations.
2342// 2) it should be smaller than i64s because we don't have vector operations
2343// that work on i64s.
2344// 3) we don't want elements to be reversed or shuffled, to make sure the
2345// tail-predication masks/predicates the right lanes.
2346//
2347static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2348                                 const DataLayout &DL,
2349 const LoopAccessInfo *LAI) {
2350 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2351
2352 // If there are live-out values, it is probably a reduction. We can predicate
2353 // most reduction operations freely under MVE using a combination of
2354 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2355 // floating point and integer reductions, but don't check for operators
2356 // specifically here. If the value ends up not being a reduction (and so the
2357 // vectorizer cannot tailfold the loop), we should fall back to standard
2358 // vectorization automatically.
2359  SmallVector<Instruction *, 8> LiveOuts;
2360  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2361  bool ReductionsDisabled =
2362      EnableTailPredication == TailPredication::EnabledNoReductions ||
2363      EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2364
2365 for (auto *I : LiveOuts) {
2366 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2367 !I->getType()->isHalfTy()) {
2368 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2369 "live-out value\n");
2370 return false;
2371 }
2372 if (ReductionsDisabled) {
2373 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2374 return false;
2375 }
2376 }
2377
2378 // Next, check that all instructions can be tail-predicated.
2379 PredicatedScalarEvolution PSE = LAI->getPSE();
2381 int ICmpCount = 0;
2382
2383 for (BasicBlock *BB : L->blocks()) {
2384 for (Instruction &I : BB->instructionsWithoutDebug()) {
2385 if (isa<PHINode>(&I))
2386 continue;
2387 if (!canTailPredicateInstruction(I, ICmpCount)) {
2388 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2389 return false;
2390 }
2391
2392 Type *T = I.getType();
2393 if (T->getScalarSizeInBits() > 32) {
2394 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2395 return false;
2396 }
2397 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2398        Value *Ptr = getLoadStorePointerOperand(&I);
2399        Type *AccessTy = getLoadStoreType(&I);
2400 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2401 if (NextStride == 1) {
2402 // TODO: for now only allow consecutive strides of 1. We could support
2403 // other strides as long as it is uniform, but let's keep it simple
2404 // for now.
2405 continue;
2406 } else if (NextStride == -1 ||
2407 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2408 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2410 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2411 "be tail-predicated\n.");
2412 return false;
2413 // TODO: don't tail predicate if there is a reversed load?
2414 } else if (EnableMaskedGatherScatters) {
2415 // Gather/scatters do allow loading from arbitrary strides, at
2416 // least if they are loop invariant.
2417 // TODO: Loop variant strides should in theory work, too, but
2418 // this requires further testing.
2419 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2420 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2421 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2422 if (PSE.getSE()->isLoopInvariant(Step, L))
2423 continue;
2424 }
2425 }
2426 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2427 "tail-predicate\n.");
2428 return false;
2429 }
2430 }
2431 }
2432
2433 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2434 return true;
2435}
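// Example (sketch): a loop that loads i8 data and widens it with a zext of the
// load can be tail-predicated, but a zext of a value produced by arithmetic
// cannot (extends must be extending loads). Likewise a stride-2 access blocks
// tail-predication, since VLD2/VST2 cannot be predicated.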
2436
2437bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
2438  if (!EnableTailPredication) {
2439 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2440 return false;
2441 }
2442
2443 // Creating a predicated vector loop is the first step for generating a
2444 // tail-predicated hardware loop, for which we need the MVE masked
2445 // load/stores instructions:
2446 if (!ST->hasMVEIntegerOps())
2447 return false;
2448
2449 LoopVectorizationLegality *LVL = TFI->LVL;
2450 Loop *L = LVL->getLoop();
2451
2452 // For now, restrict this to single block loops.
2453 if (L->getNumBlocks() > 1) {
2454 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2455 "loop.\n");
2456 return false;
2457 }
2458
2459 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2460
2461 LoopInfo *LI = LVL->getLoopInfo();
2462 HardwareLoopInfo HWLoopInfo(L);
2463 if (!HWLoopInfo.canAnalyze(*LI)) {
2464 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2465 "analyzable.\n");
2466 return false;
2467 }
2468
2469  AssumptionCache *AC = LVL->getAssumptionCache();
2470  ScalarEvolution *SE = LVL->getScalarEvolution();
2471
2472 // This checks if we have the low-overhead branch architecture
2473 // extension, and if we will create a hardware-loop:
2474 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2475 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2476 "profitable.\n");
2477 return false;
2478 }
2479
2480 DominatorTree *DT = LVL->getDominatorTree();
2481 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2482 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2483 "a candidate.\n");
2484 return false;
2485 }
2486
2487 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2488}
2489
2490TailFoldingStyle
2491ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2492  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2493    return TailFoldingStyle::DataWithoutLaneMask;
2494
2495  // Intrinsic @llvm.get.active.lane.mask is supported.
2496  // It is used in the MVETailPredication pass, which requires the number of
2497  // elements processed by this vector loop to setup the tail-predicated
2498  // loop.
2499  return TailFoldingStyle::DataAndControlFlow;
2500}
2501void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2502                                         TTI::UnrollingPreferences &UP,
2503                                         OptimizationRemarkEmitter *ORE) {
2504  // Enable Upper bound unrolling universally, providing that we do not see an
2505 // active lane mask, which will be better kept as a loop to become tail
2506 // predicated than to be conditionally unrolled.
2507 UP.UpperBound =
2508 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2509 return isa<IntrinsicInst>(I) &&
2510 cast<IntrinsicInst>(I).getIntrinsicID() ==
2511 Intrinsic::get_active_lane_mask;
2512 });
2513
2514 // Only currently enable these preferences for M-Class cores.
2515 if (!ST->isMClass())
2516 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2517
2518 // Disable loop unrolling for Oz and Os.
2519 UP.OptSizeThreshold = 0;
2520  UP.PartialOptSizeThreshold = 0;
2521  if (L->getHeader()->getParent()->hasOptSize())
2522 return;
2523
2524 SmallVector<BasicBlock*, 4> ExitingBlocks;
2525 L->getExitingBlocks(ExitingBlocks);
2526 LLVM_DEBUG(dbgs() << "Loop has:\n"
2527 << "Blocks: " << L->getNumBlocks() << "\n"
2528 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2529
2530  // Allow at most one exit other than the latch. This acts as an early exit
2531 // as it mirrors the profitability calculation of the runtime unroller.
2532 if (ExitingBlocks.size() > 2)
2533 return;
2534
2535 // Limit the CFG of the loop body for targets with a branch predictor.
2536 // Allowing 4 blocks permits if-then-else diamonds in the body.
2537 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2538 return;
2539
2540 // Don't unroll vectorized loops, including the remainder loop
2541 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2542 return;
2543
2544 // Scan the loop: don't unroll loops with calls as this could prevent
2545 // inlining.
2546  InstructionCost Cost = 0;
2547  for (auto *BB : L->getBlocks()) {
2548 for (auto &I : *BB) {
2549 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2550 // scalar code.
2551 if (I.getType()->isVectorTy())
2552 return;
2553
2554 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2555 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2556 if (!isLoweredToCall(F))
2557 continue;
2558 }
2559 return;
2560 }
2561
2562 SmallVector<const Value*, 4> Operands(I.operand_values());
2563      Cost += getInstructionCost(&I, Operands,
2564                                 TargetTransformInfo::TCK_SizeAndLatency);
2565    }
2566 }
2567
2568 // On v6m cores, there are very few registers available. We can easily end up
2569 // spilling and reloading more registers in an unrolled loop. Look at the
2570 // number of LCSSA phis as a rough measure of how many registers will need to
2571 // be live out of the loop, reducing the default unroll count if more than 1
2572 // value is needed. In the long run, all of this should be being learnt by a
2573 // machine.
2574 unsigned UnrollCount = 4;
2575 if (ST->isThumb1Only()) {
2576 unsigned ExitingValues = 0;
2577    SmallVector<BasicBlock *, 4> ExitBlocks;
2578    L->getExitBlocks(ExitBlocks);
2579 for (auto *Exit : ExitBlocks) {
2580 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2581 // only the last is expected to be needed for address operands.
2582 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2583 return PH.getNumOperands() != 1 ||
2584 !isa<GetElementPtrInst>(PH.getOperand(0));
2585 });
2586 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2587 }
2588 if (ExitingValues)
2589 UnrollCount /= ExitingValues;
2590 if (UnrollCount <= 1)
2591 return;
2592 }
2593
2594 // For processors with low overhead branching (LOB), runtime unrolling the
2595 // innermost loop is often detrimental to performance. In these cases the loop
2596 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2597 // deeply nested loops get executed multiple times, negating the benefits of
2598 // LOB. This is particularly noticable when the loop trip count of the
2599 // innermost loop varies within the outer loop, such as in the case of
2600 // triangular matrix decompositions. In these cases we will prefer to not
2601 // unroll the innermost loop, with the intention for it to be executed as a
2602 // low overhead loop.
2603 bool Runtime = true;
2604 if (ST->hasLOB()) {
2605    if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2606      const auto *BETC = SE.getBackedgeTakenCount(L);
2607 auto *Outer = L->getOutermostLoop();
2608 if ((L != Outer && Outer != L->getParentLoop()) ||
2609 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2610 Runtime = false;
2611 }
2612 }
2613 }
2614
2615 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2616 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2617
2618 UP.Partial = true;
2619 UP.Runtime = Runtime;
2620  UP.UnrollRemainder = true;
2621  UP.DefaultUnrollRuntimeCount = UnrollCount;
2622  UP.UnrollAndJam = true;
2623  UP.UnrollAndJamInnerLoopThreshold = 60;
2624
2625 // Force unrolling small loops can be very useful because of the branch
2626 // taken cost of the backedge.
2627 if (Cost < 12)
2628 UP.Force = true;
2629}
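// Example (sketch): on a Thumb1-only core, a loop with two LCSSA live-out
// values has its default runtime unroll count reduced from 4 to 2. On an
// M-profile core with LOB, an innermost loop whose trip count varies in the
// surrounding loop keeps Runtime == false, so it can remain a low-overhead
// loop instead of being runtime-unrolled.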
2630
2631void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2632                                       TTI::PeelingPreferences &PP) {
2633  BaseT::getPeelingPreferences(L, SE, PP);
2634}
2635
2636bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2637 TTI::ReductionFlags Flags) const {
2638 if (!ST->hasMVEIntegerOps())
2639 return false;
2640
2641 unsigned ScalarBits = Ty->getScalarSizeInBits();
2642 switch (Opcode) {
2643 case Instruction::Add:
2644 return ScalarBits <= 64;
2645 default:
2646 return false;
2647 }
2648}
2649
2650bool ARMTTIImpl::preferPredicatedReductionSelect(
2651    unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2652 if (!ST->hasMVEIntegerOps())
2653 return false;
2654 return true;
2655}
2656
2657InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2658                                                 StackOffset BaseOffset,
2659                                                 bool HasBaseReg, int64_t Scale,
2660                                                 unsigned AddrSpace) const {
2661  TargetLoweringBase::AddrMode AM;
2662  AM.BaseGV = BaseGV;
2663 AM.BaseOffs = BaseOffset.getFixed();
2664 AM.HasBaseReg = HasBaseReg;
2665 AM.Scale = Scale;
2666 AM.ScalableOffset = BaseOffset.getScalable();
2667 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2668 if (ST->hasFPAO())
2669 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2670 return 0;
2671 }
2672 return -1;
2673}
2674
2675bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2676 if (Thumb) {
2677 // B.W is available in any Thumb2-supporting target, and also in every
2678 // version of Armv8-M, even Baseline which does not include the rest of
2679 // Thumb2.
2680 return ST->isThumb2() || ST->hasV8MBaselineOps();
2681 } else {
2682 // B is available in all versions of the Arm ISA, so the only question is
2683 // whether that ISA is available at all.
2684 return ST->hasARMOps();
2685 }
2686}
2687
2688/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2689/// of the vector elements.
2690static bool areExtractExts(Value *Ext1, Value *Ext2) {
2691 using namespace PatternMatch;
2692
2693 auto areExtDoubled = [](Instruction *Ext) {
2694 return Ext->getType()->getScalarSizeInBits() ==
2695 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2696 };
2697
2698 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2699 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2700 !areExtDoubled(cast<Instruction>(Ext1)) ||
2701 !areExtDoubled(cast<Instruction>(Ext2)))
2702 return false;
2703
2704 return true;
2705}
2706
2707/// Check if sinking \p I's operands to I's basic block is profitable, because
2708/// the operands can be folded into a target instruction, e.g.
2709/// sext/zext can be folded into vsubl.
2710bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2711                                            SmallVectorImpl<Use *> &Ops) const {
2712 using namespace PatternMatch;
2713
2714 if (!I->getType()->isVectorTy())
2715 return false;
2716
2717 if (ST->hasNEON()) {
2718 switch (I->getOpcode()) {
2719 case Instruction::Sub:
2720 case Instruction::Add: {
2721 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2722 return false;
2723 Ops.push_back(&I->getOperandUse(0));
2724 Ops.push_back(&I->getOperandUse(1));
2725 return true;
2726 }
2727 default:
2728 return false;
2729 }
2730 }
2731
2732 if (!ST->hasMVEIntegerOps())
2733 return false;
2734
2735 auto IsFMSMul = [&](Instruction *I) {
2736 if (!I->hasOneUse())
2737 return false;
2738 auto *Sub = cast<Instruction>(*I->users().begin());
2739 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2740 };
2741 auto IsFMS = [&](Instruction *I) {
2742 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2743 match(I->getOperand(1), m_FNeg(m_Value())))
2744 return true;
2745 return false;
2746 };
2747
2748 auto IsSinker = [&](Instruction *I, int Operand) {
2749 switch (I->getOpcode()) {
2750 case Instruction::Add:
2751 case Instruction::Mul:
2752 case Instruction::FAdd:
2753 case Instruction::ICmp:
2754 case Instruction::FCmp:
2755 return true;
2756 case Instruction::FMul:
2757 return !IsFMSMul(I);
2758 case Instruction::Sub:
2759 case Instruction::FSub:
2760 case Instruction::Shl:
2761 case Instruction::LShr:
2762 case Instruction::AShr:
2763 return Operand == 1;
2764 case Instruction::Call:
2765 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2766 switch (II->getIntrinsicID()) {
2767 case Intrinsic::fma:
2768 return !IsFMS(I);
2769 case Intrinsic::sadd_sat:
2770 case Intrinsic::uadd_sat:
2771 case Intrinsic::arm_mve_add_predicated:
2772 case Intrinsic::arm_mve_mul_predicated:
2773 case Intrinsic::arm_mve_qadd_predicated:
2774 case Intrinsic::arm_mve_vhadd:
2775 case Intrinsic::arm_mve_hadd_predicated:
2776 case Intrinsic::arm_mve_vqdmull:
2777 case Intrinsic::arm_mve_vqdmull_predicated:
2778 case Intrinsic::arm_mve_vqdmulh:
2779 case Intrinsic::arm_mve_qdmulh_predicated:
2780 case Intrinsic::arm_mve_vqrdmulh:
2781 case Intrinsic::arm_mve_qrdmulh_predicated:
2782 case Intrinsic::arm_mve_fma_predicated:
2783 return true;
2784 case Intrinsic::ssub_sat:
2785 case Intrinsic::usub_sat:
2786 case Intrinsic::arm_mve_sub_predicated:
2787 case Intrinsic::arm_mve_qsub_predicated:
2788 case Intrinsic::arm_mve_hsub_predicated:
2789 case Intrinsic::arm_mve_vhsub:
2790 return Operand == 1;
2791 default:
2792 return false;
2793 }
2794 }
2795 return false;
2796 default:
2797 return false;
2798 }
2799 };
2800
2801 for (auto OpIdx : enumerate(I->operands())) {
2802 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2803 // Make sure we are not already sinking this operand
2804 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2805 continue;
2806
2807 Instruction *Shuffle = Op;
2808 if (Shuffle->getOpcode() == Instruction::BitCast)
2809 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
2810 // We are looking for a splat that can be sunk.
2811 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
2812 m_ZeroInt()),
2813 m_Undef(), m_ZeroMask())))
2814 continue;
2815 if (!IsSinker(I, OpIdx.index()))
2816 continue;
2817
2818 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2819 // and vector registers
2820 for (Use &U : Op->uses()) {
2821 Instruction *Insn = cast<Instruction>(U.getUser());
2822 if (!IsSinker(Insn, U.getOperandNo()))
2823 return false;
2824 }
2825
2826 Ops.push_back(&Shuffle->getOperandUse(0));
2827 if (Shuffle != Op)
2828 Ops.push_back(&Op->getOperandUse(0));
2829 Ops.push_back(&OpIdx.value());
2830 }
2831 return true;
2832}
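// Example (sketch): for a vector add where one operand is a splat built as
// insertelement + shufflevector with a zero mask, the splat (and a bitcast
// feeding it, if present) is reported as sinkable, so instruction selection
// can keep the scalar in a GPR and use the vector-by-scalar form of the MVE
// instruction rather than materialising the splat in a vector register.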
2833
2834unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
2835                                                 Type *ArrayType) const {
2836 if (!UseWidenGlobalArrays) {
2837 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
2838 return false;
2839 }
2840
2841  // Don't modify non-integer array types.
2842  if (!ArrayType || !ArrayType->isArrayTy() ||
2843      !ArrayType->getArrayElementType()->isIntegerTy())
2844    return 0;
2845
2846 // We pad to 4 byte boundaries
2847 if (Size % 4 == 0)
2848 return 0;
2849
2850 unsigned NumBytesToPad = 4 - (Size % 4);
2851 unsigned NewSize = Size + NumBytesToPad;
2852
2853 // Max number of bytes that memcpy allows for lowering to load/stores before
2854 // it uses library function (__aeabi_memcpy).
2855 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2856
2857 if (NewSize > MaxMemIntrinsicSize)
2858 return 0;
2859
2860 return NumBytesToPad;
2861}
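// Example (sketch): a 13-byte global string is padded by 3 bytes to 16 so that
// a memcpy from it can be lowered to word-sized loads/stores; sizes that are
// already a multiple of 4, or whose padded size exceeds the memcpy inline
// threshold, get no padding.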
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
This file implements a class to represent arbitrary precision integral constant values and operations...
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
static cl::opt< bool > UseWidenGlobalArrays("widen-global-strings", cl::Hidden, cl::init(true), cl::desc("Enable the widening of global strings to alignment boundaries"))
cl::opt< bool > EnableMaskedGatherScatters
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
cl::opt< TailPredication::Mode > EnableTailPredication
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
This file a TargetTransformInfo::Concept conforming object specific to the ARM target machine.
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:78
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:624
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
bool hasARMOps() const
Definition: ARMSubtarget.h:302
bool isThumb1Only() const
Definition: ARMSubtarget.h:403
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:311
bool isThumb2() const
Definition: ARMSubtarget.h:404
bool hasVFP2Base() const
Definition: ARMSubtarget.h:308
bool isMClass() const
Definition: ARMSubtarget.h:405
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:519
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
bool maybeLoweredToCall(Instruction &I)
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
InstructionCost getMemcpyCost(const Instruction *I)
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLoweredToCall(const Function *F)
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool hasArmWideBranch(bool Thumb) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool isLegalMaskedGather(Type *Ty, Align Alignment)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool isProfitableLSRChainElement(Instruction *I)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool useSoftFloat() const override
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Class to represent array types.
Definition: DerivedTypes.h:395
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:622
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:694
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
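A hedged sketch of how type-legalization cost is typically consumed inside a BasicTTIImplBase-derived cost hook; the surrounding member context, the TLI pointer, and the unit cost are assumptions, not code taken from this file:

  // Legalize the IR type, map the IR opcode to its ISD node, then scale a
  // unit cost by the number of legal pieces the type splits into.
  std::pair<llvm::InstructionCost, llvm::MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (ISD == llvm::ISD::ADD && LT.second.isVector())
    return LT.first * 1; // assumed: one cheap vector op per legalized piece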
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:379
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:980
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
This class represents a range of values.
Definition: ConstantRange.h:47
ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
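For illustration only (not from this file), intersecting two 32-bit ranges; the bounds are arbitrary:

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/ConstantRange.h"

  // [0, 100) intersected with [50, 200) yields [50, 100).
  static llvm::ConstantRange demoIntersect() {
    llvm::ConstantRange A(llvm::APInt(32, 0), llvm::APInt(32, 100));
    llvm::ConstantRange B(llvm::APInt(32, 50), llvm::APInt(32, 200));
    return A.intersectWith(B);
  }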
This is an important base class in LLVM.
Definition: Constant.h:42
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
TypeSize getTypeSizeInBits(Type *Ty) const
Return the size of the specified type in bits; see the size-examples table in DataLayout.h.
Definition: DataLayout.h:617
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
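A small sketch (the helper is an editorial assumption) building the 128-bit MVE-sized vector type <4 x i32>:

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"

  // Create <4 x i32> and read back its element count.
  static unsigned demoVectorType(llvm::LLVMContext &Ctx) {
    auto *V4I32 = llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
    return V4I32->getNumElements(); // 4
  }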
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Definition: IRBuilder.cpp:1163
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
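A hedged IRBuilder sketch tying the entries above together; the helper name is illustrative and X is assumed to have a floating-point type:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"

  // Append a 4-lane broadcast of X and a call to llvm.fabs on it at the end of BB.
  static llvm::Value *demoBuilder(llvm::BasicBlock *BB, llvm::Value *X) {
    llvm::IRBuilder<> Builder(BB->getContext());
    Builder.SetInsertPoint(BB); // append to the end of BB
    llvm::Value *Splat = Builder.CreateVectorSplat(4, X, "splat");
    return Builder.CreateIntrinsic(llvm::Intrinsic::fabs,
                                   {Splat->getType()}, {Splat});
  }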
The core instruction combiner logic.
Definition: InstCombiner.h:48
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:337
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:336
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:388
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth, const SimplifyQuery &Q)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:412
BuilderTy & Builder
Definition: InstCombiner.h:61
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:334
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isShift() const
Definition: Instruction.h:282
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
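A minimal sketch (not from this file) composing a fixed and a vscale-scaled byte offset:

  #include "llvm/Support/TypeSize.h"

  // 16 fixed bytes plus 8 bytes scaled by vscale; the two parts stay separate.
  static llvm::StackOffset demoOffset() {
    return llvm::StackOffset::getFixed(16) + llvm::StackOffset::getScalable(8);
  }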
Provides information about what library functions are available for the current target.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:261
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
Type * getArrayElementType() const
Definition: Type.h:411
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
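An illustrative helper (assumption: compiled inside the ARM backend, where this MCTargetDesc header is on the include path) checking whether a constant is encodable as a shifter-operand immediate; both helpers return -1 when the value cannot be encoded:

  #include "MCTargetDesc/ARMAddressingModes.h"

  // True if Imm can be materialized directly as an ARM or Thumb-2
  // shifter-operand immediate.
  static bool isCheapARMImmediate(unsigned Imm, bool IsThumb2) {
    int Enc = IsThumb2 ? llvm::ARM_AM::getT2SOImmVal(Imm)
                       : llvm::ARM_AM::getSOImmVal(Imm);
    return Enc != -1;
  }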
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:165
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
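A short PatternMatch sketch (editorial example; the helper name is illustrative) showing how these matchers compose:

  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  // Recognize "X + 0" with the operands in either order and capture X.
  static llvm::Value *matchAddOfZero(llvm::Value *V) {
    using namespace llvm::PatternMatch;
    llvm::Value *X = nullptr;
    if (match(V, m_c_Add(m_Value(X), m_ZeroInt())))
      return X;
    return nullptr;
  }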
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
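A toy cost table and lookup in the style used by target TTI implementations; the table contents and the fall-back cost are illustrative assumptions:

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"

  // Look up a per-(ISD opcode, MVT) cost, with a default when no entry matches.
  static unsigned lookupToyCost(int ISD, llvm::MVT VT) {
    static const llvm::CostTblEntry Tbl[] = {
        {llvm::ISD::ADD, llvm::MVT::v4i32, 1},
        {llvm::ISD::MUL, llvm::MVT::v4i32, 2},
    };
    if (const auto *Entry = llvm::CostTableLookup(Tbl, ISD, VT))
      return Entry->Cost;
    return 4; // assumed fall-back cost
  }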
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1109
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
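A small sketch (not from this file) of iterating a container together with the running index:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  // Count even values sitting at even positions.
  static unsigned countEvenAtEvenIndex(const llvm::SmallVectorImpl<int> &Vals) {
    unsigned N = 0;
    for (const auto &En : llvm::enumerate(Vals))
      if (En.index() % 2 == 0 && En.value() % 2 == 0)
        ++N;
    return N;
  }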
@ Runtime
Detect stack use after return unless disabled at runtime (ASAN_OPTIONS=detect_stack_use_after_retur...
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:242
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:123
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
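For illustration (the helper is an editorial assumption), the usual alignment sanity check:

  #include <cstdint>
  #include "llvm/Support/MathExtras.h"

  // isPowerOf2_32(16) is true; isPowerOf2_32(0) and isPowerOf2_32(24) are false.
  static bool isUsableAlignment(uint32_t Alignment) {
    return llvm::isPowerOf2_32(Alignment);
  }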
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
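A hedged sketch (the helper name is illustrative) of matching an unsigned-minimum select idiom:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Value.h"

  // True if V computes umin(A, B) via a compare+select idiom; A and B receive
  // the matched operands.
  static bool isUnsignedMinIdiom(llvm::Value *V, llvm::Value *&A, llvm::Value *&B) {
    llvm::SelectPatternResult R = llvm::matchSelectPattern(V, A, B);
    return R.Flavor == llvm::SPF_UMIN;
  }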
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
InstructionCost Cost
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
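A tiny sketch (values are arbitrary) of the Align / MaybeAlign distinction:

  #include <cstdint>
  #include "llvm/Support/Alignment.h"

  // Align always holds a non-zero power of two; MaybeAlign may be unset.
  static uint64_t demoAlign() {
    llvm::Align A(16);
    llvm::MaybeAlign M(A); // a known alignment wrapped as "maybe"
    return A.value() + (M ? M->value() : 0); // 16 + 16
  }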
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
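A hedged sketch (the helper name and the 128-bit threshold are illustrative) of the kind of EVT query a cost model performs on an IR type:

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/Type.h"

  // True if Ty maps to a simple fixed-length vector of exactly 128 bits,
  // e.g. <4 x i32> or <8 x i16>.
  static bool isSimple128BitVector(llvm::Type *Ty) {
    llvm::EVT VT = llvm::EVT::getEVT(Ty, /*HandleUnknown=*/true);
    return VT.isSimple() && VT.isFixedLengthVector() &&
           VT.getScalarSizeInBits() * VT.getVectorNumElements() == 128;
  }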
Attributes of a target dependent hardware loop.
bool canAnalyze(LoopInfo &LI)
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Type Conversion Cost Table.
Definition: CostTable.h:55