AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
22#include "llvm/IR/PatternMatch.h"
23#include "llvm/Support/Debug.h"
24#include "llvm/Transforms/InstCombine/InstCombiner.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42namespace {
43class TailFoldingKind {
44private:
45 uint8_t Bits = 0; // Currently defaults to disabled.
46
47public:
48 enum TailFoldingOpts {
49 TFDisabled = 0x0,
50 TFReductions = 0x01,
51 TFRecurrences = 0x02,
52 TFReverse = 0x04,
53 TFSimple = 0x80,
54 TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple
55 };
56
57 void operator=(const std::string &Val) {
58 if (Val.empty())
59 return;
60 SmallVector<StringRef, 6> TailFoldTypes;
61 StringRef(Val).split(TailFoldTypes, '+', -1, false);
62 for (auto TailFoldType : TailFoldTypes) {
63 if (TailFoldType == "disabled")
64 Bits = 0;
65 else if (TailFoldType == "all")
66 Bits = TFAll;
67 else if (TailFoldType == "default")
68 Bits = 0; // Currently defaults to never tail-folding.
69 else if (TailFoldType == "simple")
70 add(TFSimple);
71 else if (TailFoldType == "reductions")
72 add(TFReductions);
73 else if (TailFoldType == "recurrences")
74 add(TFRecurrences);
75 else if (TailFoldType == "reverse")
76 add(TFReverse);
77 else if (TailFoldType == "noreductions")
78 remove(TFReductions);
79 else if (TailFoldType == "norecurrences")
80 remove(TFRecurrences);
81 else if (TailFoldType == "noreverse")
82 remove(TFReverse);
83 else {
84 errs()
85 << "invalid argument " << TailFoldType.str()
86 << " to -sve-tail-folding=; each element must be one of: disabled, "
87 "all, default, simple, reductions, noreductions, recurrences, "
88 "norecurrences, reverse, noreverse\n";
89 }
90 }
91 }
92
93 operator uint8_t() const { return Bits; }
94
95 void add(uint8_t Flag) { Bits |= Flag; }
96 void remove(uint8_t Flag) { Bits &= ~Flag; }
97};
98} // namespace
99
100TailFoldingKind TailFoldingKindLoc;
101
102cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
103 "sve-tail-folding",
104 cl::desc(
105 "Control the use of vectorisation using tail-folding for SVE:"
106 "\ndisabled No loop types will vectorize using tail-folding"
107 "\ndefault Uses the default tail-folding settings for the target "
108 "CPU"
109 "\nall All legal loop types will vectorize using tail-folding"
110 "\nsimple Use tail-folding for simple loops (not reductions or "
111 "recurrences)"
112 "\nreductions Use tail-folding for loops containing reductions"
113 "\nrecurrences Use tail-folding for loops containing fixed order "
114 "recurrences"
115 "\nreverse Use tail-folding for loops requiring reversed "
116 "predicates"),
117 cl::location(TailFoldingKindLoc));
118
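// Usage sketch (illustrative, not part of the original source): passing
// "-mllvm -sve-tail-folding=all+noreverse" first sets every bit via "all" and
// then clears TFReverse, because the string is split on '+' and parsed left to
// right, with the "no"-prefixed names removing individual bits.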
119// Experimental option that will only be fully functional when the
120// code-generator is changed to use SVE instead of NEON for all fixed-width
121// operations.
122static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
123 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
124
125// Experimental option that will only be fully functional when the cost-model
126// and code-generator have been changed to avoid using scalable vector
127// instructions that are not legal in streaming SVE mode.
128static cl::opt<bool> EnableScalableAutovecInStreamingMode(
129 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
130
131bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
132 const Function *Callee) const {
133 SMEAttrs CallerAttrs(*Caller);
134 SMEAttrs CalleeAttrs(*Callee);
135 if (CallerAttrs.requiresSMChange(CalleeAttrs,
136 /*BodyOverridesInterface=*/true) ||
137 CallerAttrs.requiresLazySave(CalleeAttrs) ||
138 CalleeAttrs.hasNewZAInterface())
139 return false;
140
141 const TargetMachine &TM = getTLI()->getTargetMachine();
142
143 const FeatureBitset &CallerBits =
144 TM.getSubtargetImpl(*Caller)->getFeatureBits();
145 const FeatureBitset &CalleeBits =
146 TM.getSubtargetImpl(*Callee)->getFeatureBits();
147
148 // Inline a callee if its target-features are a subset of the caller's
149 // target-features.
150 return (CallerBits & CalleeBits) == CalleeBits;
151}
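// Illustrative example (not from the original source): a caller built with
// "+sve,+sve2" may have a callee built with only "+sve" inlined into it, since
// the callee's feature bits are a subset of the caller's; the reverse
// direction fails the subset check above.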
152
157}
158
159/// Calculate the cost of materializing a 64-bit value. This helper
160/// method might only calculate a fraction of a larger immediate. Therefore it
161/// is valid to return a cost of ZERO.
162InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
163 // Check if the immediate can be encoded within an instruction.
164 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
165 return 0;
166
167 if (Val < 0)
168 Val = ~Val;
169
170 // Calculate how many moves we will need to materialize this constant.
171 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
172 AArch64_IMM::expandMOVImm(Val, 64, Insn);
173 return Insn.size();
174}
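// Illustrative example (not from the original source): 0xFF00FF00FF00FF00 is a
// valid 64-bit logical immediate, so it costs 0 here because it folds into the
// using instruction, whereas 0x12345678 needs a MOVZ plus one MOVK and this
// helper would return 2.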
175
176/// Calculate the cost of materializing the given constant.
177InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
178 TTI::TargetCostKind CostKind) {
179 assert(Ty->isIntegerTy());
180
181 unsigned BitSize = Ty->getPrimitiveSizeInBits();
182 if (BitSize == 0)
183 return ~0U;
184
185 // Sign-extend all constants to a multiple of 64-bit.
186 APInt ImmVal = Imm;
187 if (BitSize & 0x3f)
188 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
189
190 // Split the constant into 64-bit chunks and calculate the cost for each
191 // chunk.
193 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
194 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
195 int64_t Val = Tmp.getSExtValue();
196 Cost += getIntImmCost(Val);
197 }
198 // We need at least one instruction to materialize the constant.
199 return std::max<InstructionCost>(1, Cost);
200}
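// Illustrative example (not from the original source): a 128-bit constant is
// costed as two 64-bit chunks; if the low chunk needs two moves and the high
// chunk is zero, the result is max(1, 2 + 0) = 2.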
201
202InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
203 const APInt &Imm, Type *Ty,
204 TTI::TargetCostKind CostKind,
205 Instruction *Inst) {
206 assert(Ty->isIntegerTy());
207
208 unsigned BitSize = Ty->getPrimitiveSizeInBits();
209 // There is no cost model for constants with a bit size of 0. Return TCC_Free
210 // here, so that constant hoisting will ignore this constant.
211 if (BitSize == 0)
212 return TTI::TCC_Free;
213
214 unsigned ImmIdx = ~0U;
215 switch (Opcode) {
216 default:
217 return TTI::TCC_Free;
218 case Instruction::GetElementPtr:
219 // Always hoist the base address of a GetElementPtr.
220 if (Idx == 0)
221 return 2 * TTI::TCC_Basic;
222 return TTI::TCC_Free;
223 case Instruction::Store:
224 ImmIdx = 0;
225 break;
226 case Instruction::Add:
227 case Instruction::Sub:
228 case Instruction::Mul:
229 case Instruction::UDiv:
230 case Instruction::SDiv:
231 case Instruction::URem:
232 case Instruction::SRem:
233 case Instruction::And:
234 case Instruction::Or:
235 case Instruction::Xor:
236 case Instruction::ICmp:
237 ImmIdx = 1;
238 break;
239 // Always return TCC_Free for the shift value of a shift instruction.
240 case Instruction::Shl:
241 case Instruction::LShr:
242 case Instruction::AShr:
243 if (Idx == 1)
244 return TTI::TCC_Free;
245 break;
246 case Instruction::Trunc:
247 case Instruction::ZExt:
248 case Instruction::SExt:
249 case Instruction::IntToPtr:
250 case Instruction::PtrToInt:
251 case Instruction::BitCast:
252 case Instruction::PHI:
253 case Instruction::Call:
254 case Instruction::Select:
255 case Instruction::Ret:
256 case Instruction::Load:
257 break;
258 }
259
260 if (Idx == ImmIdx) {
261 int NumConstants = (BitSize + 63) / 64;
262 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
263 return (Cost <= NumConstants * TTI::TCC_Basic)
264 ? static_cast<int>(TTI::TCC_Free)
265 : Cost;
266 }
267 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
268}
269
270InstructionCost
271AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
272 const APInt &Imm, Type *Ty,
273 TTI::TargetCostKind CostKind) {
274 assert(Ty->isIntegerTy());
275
276 unsigned BitSize = Ty->getPrimitiveSizeInBits();
277 // There is no cost model for constants with a bit size of 0. Return TCC_Free
278 // here, so that constant hoisting will ignore this constant.
279 if (BitSize == 0)
280 return TTI::TCC_Free;
281
282 // Most (all?) AArch64 intrinsics do not support folding immediates into the
283 // selected instruction, so we compute the materialization cost for the
284 // immediate directly.
285 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
286 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
287
288 switch (IID) {
289 default:
290 return TTI::TCC_Free;
291 case Intrinsic::sadd_with_overflow:
292 case Intrinsic::uadd_with_overflow:
293 case Intrinsic::ssub_with_overflow:
294 case Intrinsic::usub_with_overflow:
295 case Intrinsic::smul_with_overflow:
296 case Intrinsic::umul_with_overflow:
297 if (Idx == 1) {
298 int NumConstants = (BitSize + 63) / 64;
299 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
300 return (Cost <= NumConstants * TTI::TCC_Basic)
301 ? static_cast<int>(TTI::TCC_Free)
302 : Cost;
303 }
304 break;
305 case Intrinsic::experimental_stackmap:
306 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
307 return TTI::TCC_Free;
308 break;
309 case Intrinsic::experimental_patchpoint_void:
310 case Intrinsic::experimental_patchpoint_i64:
311 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
312 return TTI::TCC_Free;
313 break;
314 case Intrinsic::experimental_gc_statepoint:
315 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
316 return TTI::TCC_Free;
317 break;
318 }
319 return TTI::TCC_Free;
320}
321
322TargetTransformInfo::PopcntSupportKind
323AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
324 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
325 if (TyWidth == 32 || TyWidth == 64)
326 return TTI::PSK_FastHardware;
327 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
328 return TTI::PSK_Software;
329}
330
331InstructionCost
332AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
333 TTI::TargetCostKind CostKind) {
334 auto *RetTy = ICA.getReturnType();
335 switch (ICA.getID()) {
336 case Intrinsic::umin:
337 case Intrinsic::umax:
338 case Intrinsic::smin:
339 case Intrinsic::smax: {
340 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
341 MVT::v8i16, MVT::v2i32, MVT::v4i32};
342 auto LT = getTypeLegalizationCost(RetTy);
343 // v2i64 types get converted to cmp+bif hence the cost of 2
344 if (LT.second == MVT::v2i64)
345 return LT.first * 2;
346 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
347 return LT.first;
348 break;
349 }
350 case Intrinsic::sadd_sat:
351 case Intrinsic::ssub_sat:
352 case Intrinsic::uadd_sat:
353 case Intrinsic::usub_sat: {
354 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
355 MVT::v8i16, MVT::v2i32, MVT::v4i32,
356 MVT::v2i64};
357 auto LT = getTypeLegalizationCost(RetTy);
358 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
359 // need to extend the type, as it uses shr(qadd(shl, shl)).
360 unsigned Instrs =
361 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
362 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
363 return LT.first * Instrs;
364 break;
365 }
366 case Intrinsic::abs: {
367 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
368 MVT::v8i16, MVT::v2i32, MVT::v4i32,
369 MVT::v2i64};
370 auto LT = getTypeLegalizationCost(RetTy);
371 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
372 return LT.first;
373 break;
374 }
375 case Intrinsic::experimental_stepvector: {
376 InstructionCost Cost = 1; // Cost of the `index' instruction
377 auto LT = getTypeLegalizationCost(RetTy);
378 // Legalisation of illegal vectors involves an `index' instruction plus
379 // (LT.first - 1) vector adds.
380 if (LT.first > 1) {
381 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
382 InstructionCost AddCost =
383 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
384 Cost += AddCost * (LT.first - 1);
385 }
386 return Cost;
387 }
388 case Intrinsic::bitreverse: {
389 static const CostTblEntry BitreverseTbl[] = {
390 {Intrinsic::bitreverse, MVT::i32, 1},
391 {Intrinsic::bitreverse, MVT::i64, 1},
392 {Intrinsic::bitreverse, MVT::v8i8, 1},
393 {Intrinsic::bitreverse, MVT::v16i8, 1},
394 {Intrinsic::bitreverse, MVT::v4i16, 2},
395 {Intrinsic::bitreverse, MVT::v8i16, 2},
396 {Intrinsic::bitreverse, MVT::v2i32, 2},
397 {Intrinsic::bitreverse, MVT::v4i32, 2},
398 {Intrinsic::bitreverse, MVT::v1i64, 2},
399 {Intrinsic::bitreverse, MVT::v2i64, 2},
400 };
401 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
402 const auto *Entry =
403 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
404 if (Entry) {
405 // The cost model uses the legal type (i32) that i8 and i16 are promoted
406 // to, plus 1 so that we match the actual lowering cost.
407 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
408 TLI->getValueType(DL, RetTy, true) == MVT::i16)
409 return LegalisationCost.first * Entry->Cost + 1;
410
411 return LegalisationCost.first * Entry->Cost;
412 }
413 break;
414 }
415 case Intrinsic::ctpop: {
416 if (!ST->hasNEON()) {
417 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
418 return getTypeLegalizationCost(RetTy).first * 12;
419 }
420 static const CostTblEntry CtpopCostTbl[] = {
425 {ISD::CTPOP, MVT::i64, 4},
428 {ISD::CTPOP, MVT::v8i8, 1},
429 {ISD::CTPOP, MVT::i32, 5},
430 };
431 auto LT = getTypeLegalizationCost(RetTy);
432 MVT MTy = LT.second;
433 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
434 // Extra cost of +1 when illegal vector types are legalized by promoting
435 // the integer type.
436 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
437 RetTy->getScalarSizeInBits()
438 ? 1
439 : 0;
440 return LT.first * Entry->Cost + ExtraCost;
441 }
442 break;
443 }
444 case Intrinsic::sadd_with_overflow:
445 case Intrinsic::uadd_with_overflow:
446 case Intrinsic::ssub_with_overflow:
447 case Intrinsic::usub_with_overflow:
448 case Intrinsic::smul_with_overflow:
449 case Intrinsic::umul_with_overflow: {
450 static const CostTblEntry WithOverflowCostTbl[] = {
451 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
452 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
453 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
454 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
455 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
456 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
457 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
458 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
459 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
460 {Intrinsic::usub_with_overflow, MVT::i8, 3},
461 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
462 {Intrinsic::usub_with_overflow, MVT::i16, 3},
463 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
464 {Intrinsic::usub_with_overflow, MVT::i32, 1},
465 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
466 {Intrinsic::usub_with_overflow, MVT::i64, 1},
467 {Intrinsic::smul_with_overflow, MVT::i8, 5},
468 {Intrinsic::umul_with_overflow, MVT::i8, 4},
469 {Intrinsic::smul_with_overflow, MVT::i16, 5},
470 {Intrinsic::umul_with_overflow, MVT::i16, 4},
471 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
472 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
473 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
474 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
475 };
476 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
477 if (MTy.isSimple())
478 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
479 MTy.getSimpleVT()))
480 return Entry->Cost;
481 break;
482 }
483 case Intrinsic::fptosi_sat:
484 case Intrinsic::fptoui_sat: {
485 if (ICA.getArgTypes().empty())
486 break;
487 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
488 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
489 EVT MTy = TLI->getValueType(DL, RetTy);
490 // Check for the legal types, which are where the size of the input and the
491 // output are the same, or we are using cvt f64->i32 or f32->i64.
492 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
493 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
494 LT.second == MVT::v2f64) &&
495 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
496 (LT.second == MVT::f64 && MTy == MVT::i32) ||
497 (LT.second == MVT::f32 && MTy == MVT::i64)))
498 return LT.first;
499 // Similarly for fp16 sizes
500 if (ST->hasFullFP16() &&
501 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
502 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
503 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
504 return LT.first;
505
506 // Otherwise we use a legal convert followed by a min+max
507 if ((LT.second.getScalarType() == MVT::f32 ||
508 LT.second.getScalarType() == MVT::f64 ||
509 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
510 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
511 Type *LegalTy =
512 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
513 if (LT.second.isVector())
514 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
516 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
517 LegalTy, {LegalTy, LegalTy});
518 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
519 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
520 LegalTy, {LegalTy, LegalTy});
521 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
522 return LT.first * Cost;
523 }
524 break;
525 }
526 default:
527 break;
528 }
529 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
530}
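// Illustrative example (not from the original source): a llvm.umin.v4i32 call
// legalizes to MVT::v4i32, which is in ValidMinMaxTys, so its cost is LT.first
// (1); the same intrinsic on <8 x i32> splits into two v4i32 operations and
// costs 2, while v2i64 is modelled as cmp+bif and costs LT.first * 2.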
531
532/// The function will remove redundant reinterpret casts in the presence
533/// of control flow.
534static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
535 IntrinsicInst &II) {
536 SmallVector<Instruction *, 32> Worklist;
537 auto RequiredType = II.getType();
538
539 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
540 assert(PN && "Expected Phi Node!");
541
542 // Don't create a new Phi unless we can remove the old one.
543 if (!PN->hasOneUse())
544 return std::nullopt;
545
546 for (Value *IncValPhi : PN->incoming_values()) {
547 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
548 if (!Reinterpret ||
549 Reinterpret->getIntrinsicID() !=
550 Intrinsic::aarch64_sve_convert_to_svbool ||
551 RequiredType != Reinterpret->getArgOperand(0)->getType())
552 return std::nullopt;
553 }
554
555 // Create the new Phi
556 LLVMContext &Ctx = PN->getContext();
557 IRBuilder<> Builder(Ctx);
558 Builder.SetInsertPoint(PN);
559 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
560 Worklist.push_back(PN);
561
562 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
563 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
564 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
565 Worklist.push_back(Reinterpret);
566 }
567
568 // Cleanup Phi Node and reinterprets
569 return IC.replaceInstUsesWith(II, NPN);
570}
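// Illustrative IR sketch (not from the original source):
//   %a.wide = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(... %a)
//   %b.wide = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(... %b)
//   %p = phi <vscale x 16 x i1> [ %a.wide, %bb0 ], [ %b.wide, %bb1 ]
//   %r = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(... %p)
// becomes a phi directly over %a and %b, and the now-redundant reinterprets
// are queued on the worklist above.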
571
572// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
573// => (binop (pred) (from_svbool _) (from_svbool _))
574//
575// The above transformation eliminates a `to_svbool` in the predicate
576// operand of bitwise operation `binop` by narrowing the vector width of
577// the operation. For example, it would convert a `<vscale x 16 x i1>
578// and` into a `<vscale x 4 x i1> and`. This is profitable because
579// to_svbool must zero the new lanes during widening, whereas
580// from_svbool is free.
581static std::optional<Instruction *>
582tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
583 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
584 if (!BinOp)
585 return std::nullopt;
586
587 auto IntrinsicID = BinOp->getIntrinsicID();
588 switch (IntrinsicID) {
589 case Intrinsic::aarch64_sve_and_z:
590 case Intrinsic::aarch64_sve_bic_z:
591 case Intrinsic::aarch64_sve_eor_z:
592 case Intrinsic::aarch64_sve_nand_z:
593 case Intrinsic::aarch64_sve_nor_z:
594 case Intrinsic::aarch64_sve_orn_z:
595 case Intrinsic::aarch64_sve_orr_z:
596 break;
597 default:
598 return std::nullopt;
599 }
600
601 auto BinOpPred = BinOp->getOperand(0);
602 auto BinOpOp1 = BinOp->getOperand(1);
603 auto BinOpOp2 = BinOp->getOperand(2);
604
605 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
606 if (!PredIntr ||
607 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
608 return std::nullopt;
609
610 auto PredOp = PredIntr->getOperand(0);
611 auto PredOpTy = cast<VectorType>(PredOp->getType());
612 if (PredOpTy != II.getType())
613 return std::nullopt;
614
615 IRBuilder<> Builder(II.getContext());
616 Builder.SetInsertPoint(&II);
617
618 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
619 auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
620 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
621 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
622 if (BinOpOp1 == BinOpOp2)
623 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
624 else
625 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
626 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
627
628 auto NarrowedBinOp =
629 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
630 return IC.replaceInstUsesWith(II, NarrowedBinOp);
631}
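// Illustrative IR sketch (not from the original source): with %pg of type
// <vscale x 4 x i1>,
//   from_svbool(and_z(to_svbool(%pg), %a, %b))
// becomes
//   and_z(%pg, from_svbool(%a), from_svbool(%b))
// so the bitwise operation runs at the narrower <vscale x 4 x i1> width and
// the zeroing to_svbool disappears.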
632
633static std::optional<Instruction *>
634instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
635 // If the reinterpret instruction operand is a PHI Node
636 if (isa<PHINode>(II.getArgOperand(0)))
637 return processPhiNode(IC, II);
638
639 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
640 return BinOpCombine;
641
642 SmallVector<Instruction *, 32> CandidatesForRemoval;
643 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
644
645 const auto *IVTy = cast<VectorType>(II.getType());
646
647 // Walk the chain of conversions.
648 while (Cursor) {
649 // If the type of the cursor has fewer lanes than the final result, zeroing
650 // must take place, which breaks the equivalence chain.
651 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
652 if (CursorVTy->getElementCount().getKnownMinValue() <
653 IVTy->getElementCount().getKnownMinValue())
654 break;
655
656 // If the cursor has the same type as I, it is a viable replacement.
657 if (Cursor->getType() == IVTy)
658 EarliestReplacement = Cursor;
659
660 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
661
662 // If this is not an SVE conversion intrinsic, this is the end of the chain.
663 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
664 Intrinsic::aarch64_sve_convert_to_svbool ||
665 IntrinsicCursor->getIntrinsicID() ==
666 Intrinsic::aarch64_sve_convert_from_svbool))
667 break;
668
669 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
670 Cursor = IntrinsicCursor->getOperand(0);
671 }
672
673 // If no viable replacement in the conversion chain was found, there is
674 // nothing to do.
675 if (!EarliestReplacement)
676 return std::nullopt;
677
678 return IC.replaceInstUsesWith(II, EarliestReplacement);
679}
680
681static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
682 IntrinsicInst &II) {
683 IRBuilder<> Builder(&II);
684 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
685 II.getOperand(2));
686 return IC.replaceInstUsesWith(II, Select);
687}
688
689static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
690 IntrinsicInst &II) {
691 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
692 if (!Pg)
693 return std::nullopt;
694
695 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
696 return std::nullopt;
697
698 const auto PTruePattern =
699 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
700 if (PTruePattern != AArch64SVEPredPattern::vl1)
701 return std::nullopt;
702
703 // The intrinsic is inserting into lane zero so use an insert instead.
704 auto *IdxTy = Type::getInt64Ty(II.getContext());
705 auto *Insert = InsertElementInst::Create(
706 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
707 Insert->insertBefore(&II);
708 Insert->takeName(&II);
709
710 return IC.replaceInstUsesWith(II, Insert);
711}
712
713static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
714 IntrinsicInst &II) {
715 // Replace DupX with a regular IR splat.
716 IRBuilder<> Builder(II.getContext());
717 Builder.SetInsertPoint(&II);
718 auto *RetTy = cast<ScalableVectorType>(II.getType());
719 Value *Splat =
720 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
721 Splat->takeName(&II);
722 return IC.replaceInstUsesWith(II, Splat);
723}
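// Illustrative IR sketch (not from the original source):
//   %v = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 %s)
// is replaced with the generic splat idiom (insertelement + shufflevector),
// which later IR passes recognize far more readily than the target intrinsic.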
724
725static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
726 IntrinsicInst &II) {
727 LLVMContext &Ctx = II.getContext();
728 IRBuilder<> Builder(Ctx);
729 Builder.SetInsertPoint(&II);
730
731 // Check that the predicate is all active
732 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
733 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
734 return std::nullopt;
735
736 const auto PTruePattern =
737 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
738 if (PTruePattern != AArch64SVEPredPattern::all)
739 return std::nullopt;
740
741 // Check that we have a compare of zero..
742 auto *SplatValue =
743 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
744 if (!SplatValue || !SplatValue->isZero())
745 return std::nullopt;
746
747 // ..against a dupq
748 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
749 if (!DupQLane ||
750 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
751 return std::nullopt;
752
753 // Where the dupq is a lane 0 replicate of a vector insert
754 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
755 return std::nullopt;
756
757 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
758 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
759 return std::nullopt;
760
761 // Where the vector insert is a fixed constant vector insert into undef at
762 // index zero
763 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
764 return std::nullopt;
765
766 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
767 return std::nullopt;
768
769 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
770 if (!ConstVec)
771 return std::nullopt;
772
773 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
774 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
775 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
776 return std::nullopt;
777
778 unsigned NumElts = VecTy->getNumElements();
779 unsigned PredicateBits = 0;
780
781 // Expand intrinsic operands to a 16-bit byte level predicate
782 for (unsigned I = 0; I < NumElts; ++I) {
783 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
784 if (!Arg)
785 return std::nullopt;
786 if (!Arg->isZero())
787 PredicateBits |= 1 << (I * (16 / NumElts));
788 }
789
790 // If all bits are zero bail early with an empty predicate
791 if (PredicateBits == 0) {
792 auto *PFalse = Constant::getNullValue(II.getType());
793 PFalse->takeName(&II);
794 return IC.replaceInstUsesWith(II, PFalse);
795 }
796
797 // Calculate largest predicate type used (where byte predicate is largest)
798 unsigned Mask = 8;
799 for (unsigned I = 0; I < 16; ++I)
800 if ((PredicateBits & (1 << I)) != 0)
801 Mask |= (I % 8);
802
803 unsigned PredSize = Mask & -Mask;
804 auto *PredType = ScalableVectorType::get(
805 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
806
807 // Ensure all relevant bits are set
808 for (unsigned I = 0; I < 16; I += PredSize)
809 if ((PredicateBits & (1 << I)) == 0)
810 return std::nullopt;
811
812 auto *PTruePat =
813 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
814 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
815 {PredType}, {PTruePat});
816 auto *ConvertToSVBool = Builder.CreateIntrinsic(
817 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
818 auto *ConvertFromSVBool =
819 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
820 {II.getType()}, {ConvertToSVBool});
821
822 ConvertFromSVBool->takeName(&II);
823 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
824}
825
826static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
827 IntrinsicInst &II) {
828 IRBuilder<> Builder(II.getContext());
829 Builder.SetInsertPoint(&II);
830 Value *Pg = II.getArgOperand(0);
831 Value *Vec = II.getArgOperand(1);
832 auto IntrinsicID = II.getIntrinsicID();
833 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
834
835 // lastX(splat(X)) --> X
836 if (auto *SplatVal = getSplatValue(Vec))
837 return IC.replaceInstUsesWith(II, SplatVal);
838
839 // If x and/or y is a splat value then:
840 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
841 Value *LHS, *RHS;
842 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
843 if (isSplatValue(LHS) || isSplatValue(RHS)) {
844 auto *OldBinOp = cast<BinaryOperator>(Vec);
845 auto OpC = OldBinOp->getOpcode();
846 auto *NewLHS =
847 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
848 auto *NewRHS =
849 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
850 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
851 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
852 return IC.replaceInstUsesWith(II, NewBinOp);
853 }
854 }
855
856 auto *C = dyn_cast<Constant>(Pg);
857 if (IsAfter && C && C->isNullValue()) {
858 // The intrinsic is extracting lane 0 so use an extract instead.
859 auto *IdxTy = Type::getInt64Ty(II.getContext());
860 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
861 Extract->insertBefore(&II);
862 Extract->takeName(&II);
863 return IC.replaceInstUsesWith(II, Extract);
864 }
865
866 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
867 if (!IntrPG)
868 return std::nullopt;
869
870 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
871 return std::nullopt;
872
873 const auto PTruePattern =
874 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
875
876 // Can the intrinsic's predicate be converted to a known constant index?
877 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
878 if (!MinNumElts)
879 return std::nullopt;
880
881 unsigned Idx = MinNumElts - 1;
882 // Increment the index if extracting the element after the last active
883 // predicate element.
884 if (IsAfter)
885 ++Idx;
886
887 // Ignore extracts whose index is larger than the known minimum vector
888 // length. NOTE: This is an artificial constraint where we prefer to
889 // maintain what the user asked for until an alternative is proven faster.
890 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
891 if (Idx >= PgVTy->getMinNumElements())
892 return std::nullopt;
893
894 // The intrinsic is extracting a fixed lane so use an extract instead.
895 auto *IdxTy = Type::getInt64Ty(II.getContext());
896 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
897 Extract->insertBefore(&II);
898 Extract->takeName(&II);
899 return IC.replaceInstUsesWith(II, Extract);
900}
901
902static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
903 IntrinsicInst &II) {
904 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
905 // integer variant across a variety of micro-architectures. Replace scalar
906 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
907 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
908 // depending on the micro-architecture, but has been observed as generally
909 // being faster, particularly when the CLAST[AB] op is a loop-carried
910 // dependency.
911 IRBuilder<> Builder(II.getContext());
912 Builder.SetInsertPoint(&II);
913 Value *Pg = II.getArgOperand(0);
914 Value *Fallback = II.getArgOperand(1);
915 Value *Vec = II.getArgOperand(2);
916 Type *Ty = II.getType();
917
918 if (!Ty->isIntegerTy())
919 return std::nullopt;
920
921 Type *FPTy;
922 switch (cast<IntegerType>(Ty)->getBitWidth()) {
923 default:
924 return std::nullopt;
925 case 16:
926 FPTy = Builder.getHalfTy();
927 break;
928 case 32:
929 FPTy = Builder.getFloatTy();
930 break;
931 case 64:
932 FPTy = Builder.getDoubleTy();
933 break;
934 }
935
936 Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
937 auto *FPVTy = VectorType::get(
938 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
939 Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
940 auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
941 {Pg, FPFallBack, FPVec});
942 Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
943 return IC.replaceInstUsesWith(II, FPIItoInt);
944}
945
946static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
947 IntrinsicInst &II) {
948 LLVMContext &Ctx = II.getContext();
949 IRBuilder<> Builder(Ctx);
950 Builder.SetInsertPoint(&II);
951 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
952 // can work with RDFFR_PP for ptest elimination.
953 auto *AllPat =
954 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
955 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
956 {II.getType()}, {AllPat});
957 auto *RDFFR =
958 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
959 RDFFR->takeName(&II);
960 return IC.replaceInstUsesWith(II, RDFFR);
961}
962
963static std::optional<Instruction *>
964instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
965 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
966
967 if (Pattern == AArch64SVEPredPattern::all) {
968 LLVMContext &Ctx = II.getContext();
969 IRBuilder<> Builder(Ctx);
970 Builder.SetInsertPoint(&II);
971
972 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
973 auto *VScale = Builder.CreateVScale(StepVal);
974 VScale->takeName(&II);
975 return IC.replaceInstUsesWith(II, VScale);
976 }
977
978 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
979
980 return MinNumElts && NumElts >= MinNumElts
981 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
982 II, ConstantInt::get(II.getType(), MinNumElts)))
983 : std::nullopt;
984}
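// Illustrative example (not from the original source): for cntd (NumElts == 2)
// a pattern of "all" becomes "vscale * 2" via CreateVScale, while a pattern
// such as vl1 folds to the constant 1 because the requested element count
// never exceeds the minimum vector length.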
985
986static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
987 IntrinsicInst &II) {
988 Value *PgVal = II.getArgOperand(0);
989 Value *OpVal = II.getArgOperand(1);
990
991 IRBuilder<> Builder(II.getContext());
992 Builder.SetInsertPoint(&II);
993
994 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
995 // Later optimizations prefer this form.
996 if (PgVal == OpVal &&
997 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
998 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
999 Value *Ops[] = {PgVal, OpVal};
1000 Type *Tys[] = {PgVal->getType()};
1001
1002 auto *PTest =
1003 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1004 PTest->takeName(&II);
1005
1006 return IC.replaceInstUsesWith(II, PTest);
1007 }
1008
1009 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1010 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1011
1012 if (!Pg || !Op)
1013 return std::nullopt;
1014
1015 Intrinsic::ID OpIID = Op->getIntrinsicID();
1016
1017 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1018 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1019 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1020 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1021 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1022
1023 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1024
1025 PTest->takeName(&II);
1026 return IC.replaceInstUsesWith(II, PTest);
1027 }
1028
1029 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1030 // Later optimizations may rewrite sequence to use the flag-setting variant
1031 // of instruction X to remove PTEST.
1032 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1033 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1034 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1035 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1036 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1037 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1038 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1039 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1040 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1041 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1042 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1043 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1044 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1045 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1046 Type *Tys[] = {Pg->getType()};
1047
1048 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1049 PTest->takeName(&II);
1050
1051 return IC.replaceInstUsesWith(II, PTest);
1052 }
1053
1054 return std::nullopt;
1055}
1056
1057template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1058static std::optional<Instruction *>
1059instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1060 bool MergeIntoAddendOp) {
1061 Value *P = II.getOperand(0);
1062 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1063 if (MergeIntoAddendOp) {
1064 AddendOp = II.getOperand(1);
1065 Mul = II.getOperand(2);
1066 } else {
1067 AddendOp = II.getOperand(2);
1068 Mul = II.getOperand(1);
1069 }
1070
1071 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1072 m_Value(MulOp1))))
1073 return std::nullopt;
1074
1075 if (!Mul->hasOneUse())
1076 return std::nullopt;
1077
1078 Instruction *FMFSource = nullptr;
1079 if (II.getType()->isFPOrFPVectorTy()) {
1080 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1081 // Stop the combine when the flags on the inputs differ in case dropping
1082 // flags would lead to us missing out on more beneficial optimizations.
1083 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1084 return std::nullopt;
1085 if (!FAddFlags.allowContract())
1086 return std::nullopt;
1087 FMFSource = &II;
1088 }
1089
1090 IRBuilder<> Builder(II.getContext());
1091 Builder.SetInsertPoint(&II);
1092
1093 CallInst *Res;
1094 if (MergeIntoAddendOp)
1095 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1096 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1097 else
1098 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1099 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1100
1101 return IC.replaceInstUsesWith(II, Res);
1102}
1103
1104static bool isAllActivePredicate(Value *Pred) {
1105 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1106 Value *UncastedPred;
1107 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1108 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1109 m_Value(UncastedPred)))))
1110 // If the predicate has the same or fewer lanes than the uncasted
1111 // predicate then we know the casting has no effect.
1112 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1113 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1114 Pred = UncastedPred;
1115
1116 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1117 m_ConstantInt<AArch64SVEPredPattern::all>()));
1118}
1119
1120static std::optional<Instruction *>
1121instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1122 IRBuilder<> Builder(II.getContext());
1123 Builder.SetInsertPoint(&II);
1124
1125 Value *Pred = II.getOperand(0);
1126 Value *PtrOp = II.getOperand(1);
1127 Type *VecTy = II.getType();
1128 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
1129
1130 if (isAllActivePredicate(Pred)) {
1131 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
1132 Load->copyMetadata(II);
1133 return IC.replaceInstUsesWith(II, Load);
1134 }
1135
1136 CallInst *MaskedLoad =
1137 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
1138 Pred, ConstantAggregateZero::get(VecTy));
1139 MaskedLoad->copyMetadata(II);
1140 return IC.replaceInstUsesWith(II, MaskedLoad);
1141}
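// Illustrative IR sketch (not from the original source): an
// @llvm.aarch64.sve.ld1 whose governing predicate is ptrue(all) becomes a
// plain "load <vscale x 4 x i32>, ptr %p" with the original metadata copied
// over; any other predicate instead produces @llvm.masked.load with a
// zeroinitializer passthrough, as built above.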
1142
1143static std::optional<Instruction *>
1144instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1145 IRBuilder<> Builder(II.getContext());
1146 Builder.SetInsertPoint(&II);
1147
1148 Value *VecOp = II.getOperand(0);
1149 Value *Pred = II.getOperand(1);
1150 Value *PtrOp = II.getOperand(2);
1151 Value *VecPtr =
1152 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
1153
1154 if (isAllActivePredicate(Pred)) {
1155 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
1156 Store->copyMetadata(II);
1157 return IC.eraseInstFromFunction(II);
1158 }
1159
1160 CallInst *MaskedStore = Builder.CreateMaskedStore(
1161 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
1162 MaskedStore->copyMetadata(II);
1163 return IC.eraseInstFromFunction(II);
1164}
1165
1166static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1167 switch (Intrinsic) {
1168 case Intrinsic::aarch64_sve_fmul:
1169 return Instruction::BinaryOps::FMul;
1170 case Intrinsic::aarch64_sve_fadd:
1171 return Instruction::BinaryOps::FAdd;
1172 case Intrinsic::aarch64_sve_fsub:
1173 return Instruction::BinaryOps::FSub;
1174 default:
1175 return Instruction::BinaryOpsEnd;
1176 }
1177}
1178
1179static std::optional<Instruction *>
1180instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1181 auto *OpPredicate = II.getOperand(0);
1182 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1183 if (BinOpCode == Instruction::BinaryOpsEnd ||
1184 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1185 m_ConstantInt<AArch64SVEPredPattern::all>())))
1186 return std::nullopt;
1187 IRBuilder<> Builder(II.getContext());
1188 Builder.SetInsertPoint(&II);
1189 Builder.setFastMathFlags(II.getFastMathFlags());
1190 auto BinOp =
1191 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1192 return IC.replaceInstUsesWith(II, BinOp);
1193}
1194
1195static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1196 IntrinsicInst &II) {
1197 if (auto FMLA =
1198 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1199 Intrinsic::aarch64_sve_fmla>(IC, II,
1200 true))
1201 return FMLA;
1202 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1203 Intrinsic::aarch64_sve_mla>(
1204 IC, II, true))
1205 return MLA;
1206 if (auto FMAD =
1207 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1208 Intrinsic::aarch64_sve_fmad>(IC, II,
1209 false))
1210 return FMAD;
1211 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1212 Intrinsic::aarch64_sve_mad>(
1213 IC, II, false))
1214 return MAD;
1215 return instCombineSVEVectorBinOp(IC, II);
1216}
1217
1218static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1219 IntrinsicInst &II) {
1220 if (auto FMLS =
1221 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1222 Intrinsic::aarch64_sve_fmls>(IC, II,
1223 true))
1224 return FMLS;
1225 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1226 Intrinsic::aarch64_sve_mls>(
1227 IC, II, true))
1228 return MLS;
1229 if (auto FMSB =
1230 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1231 Intrinsic::aarch64_sve_fnmsb>(
1232 IC, II, false))
1233 return FMSB;
1234 return instCombineSVEVectorBinOp(IC, II);
1235}
1236
1237static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1238 IntrinsicInst &II) {
1239 auto *OpPredicate = II.getOperand(0);
1240 auto *OpMultiplicand = II.getOperand(1);
1241 auto *OpMultiplier = II.getOperand(2);
1242
1243 IRBuilder<> Builder(II.getContext());
1244 Builder.SetInsertPoint(&II);
1245
1246 // Return true if a given instruction is a unit splat value, false otherwise.
1247 auto IsUnitSplat = [](auto *I) {
1248 auto *SplatValue = getSplatValue(I);
1249 if (!SplatValue)
1250 return false;
1251 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1252 };
1253
1254 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1255 // with a unit splat value, false otherwise.
1256 auto IsUnitDup = [](auto *I) {
1257 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1258 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1259 return false;
1260
1261 auto *SplatValue = IntrI->getOperand(2);
1262 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1263 };
1264
1265 if (IsUnitSplat(OpMultiplier)) {
1266 // [f]mul pg %n, (dupx 1) => %n
1267 OpMultiplicand->takeName(&II);
1268 return IC.replaceInstUsesWith(II, OpMultiplicand);
1269 } else if (IsUnitDup(OpMultiplier)) {
1270 // [f]mul pg %n, (dup pg 1) => %n
1271 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1272 auto *DupPg = DupInst->getOperand(1);
1273 // TODO: this is naive. The optimization is still valid if DupPg
1274 // 'encompasses' OpPredicate, not only if they're the same predicate.
1275 if (OpPredicate == DupPg) {
1276 OpMultiplicand->takeName(&II);
1277 return IC.replaceInstUsesWith(II, OpMultiplicand);
1278 }
1279 }
1280
1281 return instCombineSVEVectorBinOp(IC, II);
1282}
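// Illustrative examples (not from the original source):
//   [f]mul(%pg, %n, dup.x(1.0))  --> %n
//   [f]mul(%pg, %n, dup(%pg, 1.0)) --> %n
// and a multiply whose predicate is ptrue(all) with no unit operand falls
// through to instCombineSVEVectorBinOp and becomes a plain IR [f]mul.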
1283
1284static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1285 IntrinsicInst &II) {
1286 IRBuilder<> Builder(II.getContext());
1287 Builder.SetInsertPoint(&II);
1288 Value *UnpackArg = II.getArgOperand(0);
1289 auto *RetTy = cast<ScalableVectorType>(II.getType());
1290 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1291 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1292
1293 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1294 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1295 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1296 ScalarArg =
1297 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1298 Value *NewVal =
1299 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1300 NewVal->takeName(&II);
1301 return IC.replaceInstUsesWith(II, NewVal);
1302 }
1303
1304 return std::nullopt;
1305}
1306static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1307 IntrinsicInst &II) {
1308 auto *OpVal = II.getOperand(0);
1309 auto *OpIndices = II.getOperand(1);
1310 VectorType *VTy = cast<VectorType>(II.getType());
1311
1312 // Check whether OpIndices is a constant splat value < minimal element count
1313 // of result.
1314 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1315 if (!SplatValue ||
1316 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1317 return std::nullopt;
1318
1319 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1320 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1321 IRBuilder<> Builder(II.getContext());
1322 Builder.SetInsertPoint(&II);
1323 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
1324 auto *VectorSplat =
1325 Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1326
1327 VectorSplat->takeName(&II);
1328 return IC.replaceInstUsesWith(II, VectorSplat);
1329}
1330
1331static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1332 IntrinsicInst &II) {
1333 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1334 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1335 Value *A, *B;
1336 if (match(II.getArgOperand(0),
1337 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1338 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1339 m_Specific(A), m_Specific(B))))
1340 return IC.replaceInstUsesWith(
1341 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1342
1343 return std::nullopt;
1344}
1345
1346static std::optional<Instruction *>
1347instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1348 Value *Mask = II.getOperand(0);
1349 Value *BasePtr = II.getOperand(1);
1350 Value *Index = II.getOperand(2);
1351 Type *Ty = II.getType();
1352 Value *PassThru = ConstantAggregateZero::get(Ty);
1353
1354 // Contiguous gather => masked load.
1355 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1356 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1357 Value *IndexBase;
1358 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1359 m_Value(IndexBase), m_SpecificInt(1)))) {
1360 IRBuilder<> Builder(II.getContext());
1361 Builder.SetInsertPoint(&II);
1362
1363 Align Alignment =
1364 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1365
1366 Type *VecPtrTy = PointerType::getUnqual(Ty);
1367 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1368 BasePtr, IndexBase);
1369 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1370 CallInst *MaskedLoad =
1371 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1372 MaskedLoad->takeName(&II);
1373 return IC.replaceInstUsesWith(II, MaskedLoad);
1374 }
1375
1376 return std::nullopt;
1377}
1378
1379static std::optional<Instruction *>
1380instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1381 Value *Val = II.getOperand(0);
1382 Value *Mask = II.getOperand(1);
1383 Value *BasePtr = II.getOperand(2);
1384 Value *Index = II.getOperand(3);
1385 Type *Ty = Val->getType();
1386
1387 // Contiguous scatter => masked store.
1388 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1389 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1390 Value *IndexBase;
1391 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1392 m_Value(IndexBase), m_SpecificInt(1)))) {
1393 IRBuilder<> Builder(II.getContext());
1394 Builder.SetInsertPoint(&II);
1395
1396 Align Alignment =
1397 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1398
1399 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1400 BasePtr, IndexBase);
1401 Type *VecPtrTy = PointerType::getUnqual(Ty);
1402 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1403
1404 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1405
1406 return IC.eraseInstFromFunction(II);
1407 }
1408
1409 return std::nullopt;
1410}
1411
1412static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1413 IntrinsicInst &II) {
1414 IRBuilder<> Builder(II.getContext());
1415 Builder.SetInsertPoint(&II);
1416 Type *Int32Ty = Builder.getInt32Ty();
1417 Value *Pred = II.getOperand(0);
1418 Value *Vec = II.getOperand(1);
1419 Value *DivVec = II.getOperand(2);
1420
1421 Value *SplatValue = getSplatValue(DivVec);
1422 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1423 if (!SplatConstantInt)
1424 return std::nullopt;
1425 APInt Divisor = SplatConstantInt->getValue();
1426
1427 if (Divisor.isPowerOf2()) {
1428 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1429 auto ASRD = Builder.CreateIntrinsic(
1430 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1431 return IC.replaceInstUsesWith(II, ASRD);
1432 }
1433 if (Divisor.isNegatedPowerOf2()) {
1434 Divisor.negate();
1435 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1436 auto ASRD = Builder.CreateIntrinsic(
1437 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1438 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1439 {ASRD->getType()}, {ASRD, Pred, ASRD});
1440 return IC.replaceInstUsesWith(II, NEG);
1441 }
1442
1443 return std::nullopt;
1444}
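// Illustrative example (not from the original source): with a splat divisor of
// 4, sdiv(%pg, %x, splat(4)) becomes asrd(%pg, %x, 2), i.e. a predicated
// arithmetic-shift-right-for-divide by log2(4); a divisor of -4 additionally
// wraps the result in a predicated neg, as built above.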
1445
1446bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1447 size_t VecSize = Vec.size();
1448 if (VecSize == 1)
1449 return true;
1450 if (!isPowerOf2_64(VecSize))
1451 return false;
1452 size_t HalfVecSize = VecSize / 2;
1453
1454 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1455 RHS != Vec.end(); LHS++, RHS++) {
1456 if (*LHS != nullptr && *RHS != nullptr) {
1457 if (*LHS == *RHS)
1458 continue;
1459 else
1460 return false;
1461 }
1462 if (!AllowPoison)
1463 return false;
1464 if (*LHS == nullptr && *RHS != nullptr)
1465 *LHS = *RHS;
1466 }
1467
1468 Vec.resize(HalfVecSize);
1469 SimplifyValuePattern(Vec, AllowPoison);
1470 return true;
1471}
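// Illustrative example (not from the original source): the pattern
// {a, b, a, b} halves to {a, b}; {a, poison, a, b} is accepted only when
// AllowPoison is set, in which case the nullptr slot is filled from the
// matching element in the upper half before halving.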
1472
1473// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1474// to dupqlane(f64(C)) where C is A concatenated with B
1475static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1476 IntrinsicInst &II) {
1477 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1478 if (!match(II.getOperand(0),
1479 m_Intrinsic<Intrinsic::vector_insert>(
1480 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1481 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1482 return std::nullopt;
1483 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1484
1485 // Insert the scalars into a container ordered by InsertElement index
1486 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1487 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1488 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1489 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1490 CurrentInsertElt = InsertElt->getOperand(0);
1491 }
1492
1493 bool AllowPoison =
1494 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1495 if (!SimplifyValuePattern(Elts, AllowPoison))
1496 return std::nullopt;
1497
1498 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1499 IRBuilder<> Builder(II.getContext());
1500 Builder.SetInsertPoint(&II);
1501 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1502 for (size_t I = 0; I < Elts.size(); I++) {
1503 if (Elts[I] == nullptr)
1504 continue;
1505 InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I],
1506 Builder.getInt64(I));
1507 }
1508 if (InsertEltChain == nullptr)
1509 return std::nullopt;
1510
1511 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1512 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1513 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1514 // be narrowed back to the original type.
1515 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1516 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1517 IIScalableTy->getMinNumElements() /
1518 PatternWidth;
1519
1520 IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
1521 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1522 auto *WideShuffleMaskTy =
1523 ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
1524
1525 auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
1526 auto InsertSubvector = Builder.CreateInsertVector(
1527 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1528 auto WideBitcast =
1529 Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1530 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1531 auto WideShuffle = Builder.CreateShuffleVector(
1532 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1533 auto NarrowBitcast =
1534 Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1535
1536 return IC.replaceInstUsesWith(II, NarrowBitcast);
1537}
1538
1539static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1540 IntrinsicInst &II) {
1541 Value *A = II.getArgOperand(0);
1542 Value *B = II.getArgOperand(1);
1543 if (A == B)
1544 return IC.replaceInstUsesWith(II, A);
1545
1546 return std::nullopt;
1547}
1548
1549static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1550 IntrinsicInst &II) {
1551 IRBuilder<> Builder(&II);
1552 Value *Pred = II.getOperand(0);
1553 Value *Vec = II.getOperand(1);
1554 Value *Shift = II.getOperand(2);
1555
1556 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1557 Value *AbsPred, *MergedValue;
1558 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1559 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1560 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1561 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1562
1563 return std::nullopt;
1564
1565 // Transform is valid if any of the following are true:
1566 // * The ABS merge value is an undef or non-negative
1567 // * The ABS predicate is all active
1568 // * The ABS predicate and the SRSHL predicates are the same
1569 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1570 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1571 return std::nullopt;
1572
1573 // Only valid when the shift amount is non-negative, otherwise the rounding
1574 // behaviour of SRSHL cannot be ignored.
1575 if (!match(Shift, m_NonNegative()))
1576 return std::nullopt;
1577
1578 auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
1579 {Pred, Vec, Shift});
1580
1581 return IC.replaceInstUsesWith(II, LSL);
1582}
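// Illustrative example (not from the original source):
//   srshl(%pg, abs(%pg, %x), splat(2))  -->  lsl(%pg, abs(%pg, %x), splat(2))
// The rounding behaviour of SRSHL only matters for negative shift amounts,
// which the m_NonNegative() check above rules out.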
1583
1584std::optional<Instruction *>
1585AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1586 IntrinsicInst &II) const {
1587 Intrinsic::ID IID = II.getIntrinsicID();
1588 switch (IID) {
1589 default:
1590 break;
1591 case Intrinsic::aarch64_neon_fmaxnm:
1592 case Intrinsic::aarch64_neon_fminnm:
1593 return instCombineMaxMinNM(IC, II);
1594 case Intrinsic::aarch64_sve_convert_from_svbool:
1595 return instCombineConvertFromSVBool(IC, II);
1596 case Intrinsic::aarch64_sve_dup:
1597 return instCombineSVEDup(IC, II);
1598 case Intrinsic::aarch64_sve_dup_x:
1599 return instCombineSVEDupX(IC, II);
1600 case Intrinsic::aarch64_sve_cmpne:
1601 case Intrinsic::aarch64_sve_cmpne_wide:
1602 return instCombineSVECmpNE(IC, II);
1603 case Intrinsic::aarch64_sve_rdffr:
1604 return instCombineRDFFR(IC, II);
1605 case Intrinsic::aarch64_sve_lasta:
1606 case Intrinsic::aarch64_sve_lastb:
1607 return instCombineSVELast(IC, II);
1608 case Intrinsic::aarch64_sve_clasta_n:
1609 case Intrinsic::aarch64_sve_clastb_n:
1610 return instCombineSVECondLast(IC, II);
1611 case Intrinsic::aarch64_sve_cntd:
1612 return instCombineSVECntElts(IC, II, 2);
1613 case Intrinsic::aarch64_sve_cntw:
1614 return instCombineSVECntElts(IC, II, 4);
1615 case Intrinsic::aarch64_sve_cnth:
1616 return instCombineSVECntElts(IC, II, 8);
1617 case Intrinsic::aarch64_sve_cntb:
1618 return instCombineSVECntElts(IC, II, 16);
1619 case Intrinsic::aarch64_sve_ptest_any:
1620 case Intrinsic::aarch64_sve_ptest_first:
1621 case Intrinsic::aarch64_sve_ptest_last:
1622 return instCombineSVEPTest(IC, II);
1623 case Intrinsic::aarch64_sve_mul:
1624 case Intrinsic::aarch64_sve_fmul:
1625 return instCombineSVEVectorMul(IC, II);
1626 case Intrinsic::aarch64_sve_fadd:
1627 case Intrinsic::aarch64_sve_add:
1628 return instCombineSVEVectorAdd(IC, II);
1629 case Intrinsic::aarch64_sve_fadd_u:
1630 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1631 Intrinsic::aarch64_sve_fmla_u>(
1632 IC, II, true);
1633 case Intrinsic::aarch64_sve_fsub:
1634 case Intrinsic::aarch64_sve_sub:
1635 return instCombineSVEVectorSub(IC, II);
1636 case Intrinsic::aarch64_sve_fsub_u:
1637 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1638 Intrinsic::aarch64_sve_fmls_u>(
1639 IC, II, true);
1640 case Intrinsic::aarch64_sve_tbl:
1641 return instCombineSVETBL(IC, II);
1642 case Intrinsic::aarch64_sve_uunpkhi:
1643 case Intrinsic::aarch64_sve_uunpklo:
1644 case Intrinsic::aarch64_sve_sunpkhi:
1645 case Intrinsic::aarch64_sve_sunpklo:
1646 return instCombineSVEUnpack(IC, II);
1647 case Intrinsic::aarch64_sve_zip1:
1648 case Intrinsic::aarch64_sve_zip2:
1649 return instCombineSVEZip(IC, II);
1650 case Intrinsic::aarch64_sve_ld1_gather_index:
1651 return instCombineLD1GatherIndex(IC, II);
1652 case Intrinsic::aarch64_sve_st1_scatter_index:
1653 return instCombineST1ScatterIndex(IC, II);
1654 case Intrinsic::aarch64_sve_ld1:
1655 return instCombineSVELD1(IC, II, DL);
1656 case Intrinsic::aarch64_sve_st1:
1657 return instCombineSVEST1(IC, II, DL);
1658 case Intrinsic::aarch64_sve_sdiv:
1659 return instCombineSVESDIV(IC, II);
1660 case Intrinsic::aarch64_sve_sel:
1661 return instCombineSVESel(IC, II);
1662 case Intrinsic::aarch64_sve_srshl:
1663 return instCombineSVESrshl(IC, II);
1664 case Intrinsic::aarch64_sve_dupq_lane:
1665 return instCombineSVEDupqLane(IC, II);
1666 }
1667
1668 return std::nullopt;
1669}
1670
1671std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1672 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1673 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1674 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1675 SimplifyAndSetOp) const {
1676 switch (II.getIntrinsicID()) {
1677 default:
1678 break;
1679 case Intrinsic::aarch64_neon_fcvtxn:
1680 case Intrinsic::aarch64_neon_rshrn:
1681 case Intrinsic::aarch64_neon_sqrshrn:
1682 case Intrinsic::aarch64_neon_sqrshrun:
1683 case Intrinsic::aarch64_neon_sqshrn:
1684 case Intrinsic::aarch64_neon_sqshrun:
1685 case Intrinsic::aarch64_neon_sqxtn:
1686 case Intrinsic::aarch64_neon_sqxtun:
1687 case Intrinsic::aarch64_neon_uqrshrn:
1688 case Intrinsic::aarch64_neon_uqshrn:
1689 case Intrinsic::aarch64_neon_uqxtn:
1690 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1691 break;
1692 }
1693
1694 return std::nullopt;
1695}
1696
1699 switch (K) {
1701 return TypeSize::getFixed(64);
1703 if (!ST->isStreamingSVEModeDisabled() &&
1705 return TypeSize::getFixed(0);
1706
1707 if (ST->hasSVE())
1708 return TypeSize::getFixed(
1709 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
1710
1711 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
1714 return TypeSize::getScalable(0);
1715
1716 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
1717 }
1718 llvm_unreachable("Unsupported register kind");
1719}
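// A minimal sketch (not part of this file; the function name is made up and a
// constructed TargetTransformInfo is assumed) of how a client such as the loop
// vectorizer consumes the widths computed above: a zero scalable width means
// "do not form scalable vectors for this target".
#include "llvm/Analysis/TargetTransformInfo.h"
static bool sketchHasScalableVectorRegs(const llvm::TargetTransformInfo &TTI) {
  return TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_ScalableVector)
             .getKnownMinValue() != 0;
}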
1720
1721bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1722 ArrayRef<const Value *> Args) {
1723
1724 // A helper that returns a vector type from the given type. The number of
1725 // elements in type Ty determines the vector width.
1726 auto toVectorTy = [&](Type *ArgTy) {
1727 return VectorType::get(ArgTy->getScalarType(),
1728 cast<VectorType>(DstTy)->getElementCount());
1729 };
1730
1731 // Exit early if DstTy is not a vector type whose elements are at least
1732 // 16-bits wide. SVE doesn't generally have the same set of instructions to
1733 // perform an extend with the add/sub/mul. There are SMULLB style
1734 // instructions, but they operate on top/bottom, requiring some sort of lane
1735 // interleaving to be used with zext/sext.
1736 if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16)
1737 return false;
1738
1739 // Determine if the operation has a widening variant. We consider both the
1740 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1741 // instructions.
1742 //
1743 // TODO: Add additional widening operations (e.g., shl, etc.) once we
1744 // verify that their extending operands are eliminated during code
1745 // generation.
1746 switch (Opcode) {
1747 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1748 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1749 case Instruction::Mul: // SMULL(2), UMULL(2)
1750 break;
1751 default:
1752 return false;
1753 }
1754
1755 // To be a widening instruction (either the "wide" or "long" versions), the
1756 // second operand must be a sign- or zero extend.
1757 if (Args.size() != 2 ||
1758 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
1759 return false;
1760 auto *Extend = cast<CastInst>(Args[1]);
1761 auto *Arg0 = dyn_cast<CastInst>(Args[0]);
1762
1763 // A mul only has a mull version (not like addw). Both operands need to be
1764 // extends of the same source type.
1765 if (Opcode == Instruction::Mul &&
1766 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
1767 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
1768 return false;
1769
1770 // Legalize the destination type and ensure it can be used in a widening
1771 // operation.
1772 auto DstTyL = getTypeLegalizationCost(DstTy);
1773 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1774 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1775 return false;
1776
1777 // Legalize the source type and ensure it can be used in a widening
1778 // operation.
1779 auto *SrcTy = toVectorTy(Extend->getSrcTy());
1780 auto SrcTyL = getTypeLegalizationCost(SrcTy);
1781 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1782 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1783 return false;
1784
1785 // Get the total number of vector elements in the legalized types.
1786 InstructionCost NumDstEls =
1787 DstTyL.first * DstTyL.second.getVectorMinNumElements();
1788 InstructionCost NumSrcEls =
1789 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1790
1791 // Return true if the legalized types have the same number of vector elements
1792 // and the destination element type size is twice that of the source type.
1793 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1794}
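// Illustrative source-level shape (a sketch, not part of this file; names are
// made up) of the pattern the hook above looks for: the second operand of the
// add is a zero-extend, so the extend can fold into a "wide"/"long" NEON
// instruction such as uaddw and the extension itself ends up free.
#include <cstddef>
#include <cstdint>
static void sketchWideningAdd(uint16_t *Dst, const uint16_t *A,
                              const uint8_t *B, size_t N) {
  for (size_t I = 0; I < N; ++I)
    Dst[I] = A[I] + B[I]; // i16 + zext(i8): a candidate for uaddw codegen
}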
1795
1797 Type *Src,
1800 const Instruction *I) {
1801 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1802 assert(ISD && "Invalid opcode");
1803
1804 // If the cast is observable, and it is used by a widening instruction (e.g.,
1805 // uaddl, saddw, etc.), it may be free.
1806 if (I && I->hasOneUser()) {
1807 auto *SingleUser = cast<Instruction>(*I->user_begin());
1808 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1809 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1810 // If the cast is the second operand, it is free. We will generate either
1811 // a "wide" or "long" version of the widening instruction.
1812 if (I == SingleUser->getOperand(1))
1813 return 0;
1814 // If the cast is not the second operand, it will be free if it looks the
1815 // same as the second operand. In this case, we will generate a "long"
1816 // version of the widening instruction.
1817 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1818 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1819 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1820 return 0;
1821 }
1822 }
1823
1824 // TODO: Allow non-throughput costs that aren't binary.
1825 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1827 return Cost == 0 ? 0 : 1;
1828 return Cost;
1829 };
1830
1831 EVT SrcTy = TLI->getValueType(DL, Src);
1832 EVT DstTy = TLI->getValueType(DL, Dst);
1833
1834 if (!SrcTy.isSimple() || !DstTy.isSimple())
1835 return AdjustCost(
1836 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1837
1838 static const TypeConversionCostTblEntry
1839 ConversionTbl[] = {
1840 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
1841 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
1842 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
1843 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
1844 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
1845 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
1846 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
1847 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
1848 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
1849 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
1850 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
1851 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
1852 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
1853 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
1854 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
1855 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
1856 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
1857 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
1858 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
1859 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
1860
1861 // Truncations on nxvmiN
1878
1879 // The number of shll instructions for the extension.
1896
1897 // LowerVectorINT_TO_FP:
1904
1905 // Complex: to v2f32
1912
1913 // Complex: to v4f32
1918
1919 // Complex: to v8f32
1924
1925 // Complex: to v16f32
1928
1929 // Complex: to v2f64
1936
1937 // Complex: to v4f64
1940
1941 // LowerVectorFP_TO_INT
1948
1949 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1956
1957 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1962
1963 // Complex, from nxv2f32.
1972
1973 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1980
1981 // Complex, from nxv2f64.
1990
1991 // Complex, from nxv4f32.
2000
2001 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2006
2007 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2014
2015 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2020
2021 // Complex, from nxv8f16.
2030
2031 // Complex, from nxv4f16.
2040
2041 // Complex, from nxv2f16.
2050
2051 // Truncate from nxvmf32 to nxvmf16.
2055
2056 // Truncate from nxvmf64 to nxvmf16.
2060
2061 // Truncate from nxvmf64 to nxvmf32.
2065
2066 // Extend from nxvmf16 to nxvmf32.
2070
2071 // Extend from nxvmf16 to nxvmf64.
2075
2076 // Extend from nxvmf32 to nxvmf64.
2080
2081 // Bitcasts from float to integer
2085
2086 // Bitcasts from integer to float
2090 };
2091
2092 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2093 DstTy.getSimpleVT(),
2094 SrcTy.getSimpleVT()))
2095 return AdjustCost(Entry->Cost);
2096
2097 static const TypeConversionCostTblEntry FP16Tbl[] = {
2098 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2100 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2102 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2104 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2106 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2108 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2110 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2112 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2114 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2116 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2117 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2118 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2119 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2120 };
2121
2122 if (ST->hasFullFP16())
2123 if (const auto *Entry = ConvertCostTableLookup(
2124 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2125 return AdjustCost(Entry->Cost);
2126
2127 return AdjustCost(
2128 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2129}
2130
2132 Type *Dst,
2133 VectorType *VecTy,
2134 unsigned Index) {
2135
2136 // Make sure we were given a valid extend opcode.
2137 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2138 "Invalid opcode");
2139
2140 // We are extending an element we extract from a vector, so the source type
2141 // of the extend is the element type of the vector.
2142 auto *Src = VecTy->getElementType();
2143
2144 // Sign- and zero-extends are for integer types only.
2145 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2146
2147 // Get the cost for the extract. We compute the cost (if any) for the extend
2148 // below.
2150 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2151 CostKind, Index, nullptr, nullptr);
2152
2153 // Legalize the types.
2154 auto VecLT = getTypeLegalizationCost(VecTy);
2155 auto DstVT = TLI->getValueType(DL, Dst);
2156 auto SrcVT = TLI->getValueType(DL, Src);
2157
2158 // If the resulting type is still a vector and the destination type is legal,
2159 // we may get the extension for free. If not, get the default cost for the
2160 // extend.
2161 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2162 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2163 CostKind);
2164
2165 // The destination type should be larger than the element type. If not, get
2166 // the default cost for the extend.
2167 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2168 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2169 CostKind);
2170
2171 switch (Opcode) {
2172 default:
2173 llvm_unreachable("Opcode should be either SExt or ZExt");
2174
2175 // For sign-extends, we only need a smov, which performs the extension
2176 // automatically.
2177 case Instruction::SExt:
2178 return Cost;
2179
2180 // For zero-extends, the extend is performed automatically by a umov unless
2181 // the destination type is i64 and the element type is i8 or i16.
2182 case Instruction::ZExt:
2183 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2184 return Cost;
2185 }
2186
2187 // If we are unable to perform the extend for free, get the default cost.
2188 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2189 CostKind);
2190}
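// Illustrative intrinsics-level example (a sketch, not part of this file; the
// name is made up): extracting a lane and sign-extending it is typically a
// single smov, which is why the sign-extend above is modelled as free, whereas
// zero-extending an i8/i16 lane all the way to i64 needs extra work.
#include <arm_neon.h>
#include <cstdint>
static int64_t sketchExtractSExt(int16x8_t V) {
  return vgetq_lane_s16(V, 3); // smov performs the sign extension for free
}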
2191
2194 const Instruction *I) {
2196 return Opcode == Instruction::PHI ? 0 : 1;
2197 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2198 // Branches are assumed to be predicted.
2199 return 0;
2200}
2201
2202InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2203 Type *Val,
2204 unsigned Index,
2205 bool HasRealUse) {
2206 assert(Val->isVectorTy() && "This must be a vector type");
2207
2208 if (Index != -1U) {
2209 // Legalize the type.
2210 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2211
2212 // This type is legalized to a scalar type.
2213 if (!LT.second.isVector())
2214 return 0;
2215
2216 // The type may be split. For fixed-width vectors we can normalize the
2217 // index to the new type.
2218 if (LT.second.isFixedLengthVector()) {
2219 unsigned Width = LT.second.getVectorNumElements();
2220 Index = Index % Width;
2221 }
2222
2223 // The element at index zero is already inside the vector.
2224 // - For a physical (HasRealUse==true) insert-element or extract-element
2225 // instruction that extracts integers, an explicit FPR -> GPR move is
2226 // needed. So it has non-zero cost.
2227 // - For the rest of cases (virtual instruction or element type is float),
2228 // consider the instruction free.
2229 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2230 return 0;
2231
2232 // This is recognising a LD1 single-element structure to one lane of one
2233 // register instruction. I.e., if this is an `insertelement` instruction,
2234 // and its second operand is a load, then we will generate an LD1, which
2235 // is an expensive instruction.
2236 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2237 return ST->getVectorInsertExtractBaseCost() + 1;
2238
2239 // FIXME:
2240 // If the extract-element and insert-element instructions could be
2241 // simplified away (e.g., could be combined into users by looking at use-def
2242 // context), they have no cost. This is not done in the first place for
2243 // compile-time considerations.
2244 }
2245
2246 // All other insert/extracts cost this much.
2247 return ST->getVectorInsertExtractBaseCost();
2248}
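// Illustrative intrinsics-level example (a sketch, not part of this file; the
// name is made up) of the insert-of-a-load pattern that the helper above
// prices as an LD1 single-lane load.
#include <arm_neon.h>
#include <cstdint>
static int32x4_t sketchInsertLoadedLane(const int32_t *P, int32x4_t V) {
  return vld1q_lane_s32(P, V, 1); // ld1 {vN.s}[1], [xM]
}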
2249
2252 unsigned Index, Value *Op0,
2253 Value *Op1) {
2254 return getVectorInstrCostHelper(nullptr, Val, Index, false /* HasRealUse */);
2255}
2256
2258 Type *Val,
2260 unsigned Index) {
2261 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2262}
2263
2265 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2268 const Instruction *CxtI) {
2269
2270 // TODO: Handle more cost kinds.
2272 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2273 Op2Info, Args, CxtI);
2274
2275 // Legalize the type.
2276 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2277 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2278
2279 switch (ISD) {
2280 default:
2281 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2282 Op2Info);
2283 case ISD::SDIV:
2284 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2285 // On AArch64, scalar signed division by a power-of-two constant is
2286 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2287 // The OperandValue properties may not be the same as those of the
2288 // previous operation; conservatively assume OP_None.
2290 Instruction::Add, Ty, CostKind,
2291 Op1Info.getNoProps(), Op2Info.getNoProps());
2292 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2293 Op1Info.getNoProps(), Op2Info.getNoProps());
2295 Instruction::Select, Ty, CostKind,
2296 Op1Info.getNoProps(), Op2Info.getNoProps());
2297 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2298 Op1Info.getNoProps(), Op2Info.getNoProps());
2299 return Cost;
2300 }
2301 [[fallthrough]];
2302 case ISD::UDIV: {
2303 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2304 auto VT = TLI->getValueType(DL, Ty);
2305 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2306 // Vector signed division by a constant is expanded to the
2307 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2308 // to MULHS + SUB + SRL + ADD + SRL.
2310 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2312 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2314 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2315 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2316 }
2317 }
2318
2320 Opcode, Ty, CostKind, Op1Info, Op2Info);
2321 if (Ty->isVectorTy()) {
2322 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2323 // If the SDIV/UDIV operations are lowered using SVE, then the costs can
2324 // be lower.
2325 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2326 ->getPrimitiveSizeInBits()
2327 .getFixedValue() < 128) {
2328 EVT VT = TLI->getValueType(DL, Ty);
2329 static const CostTblEntry DivTbl[]{
2336
2337 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2338 if (nullptr != Entry)
2339 return Entry->Cost;
2340 }
2341 // For 8/16-bit elements, the cost is higher because the type
2342 // requires promotion and possibly splitting:
2343 if (LT.second.getScalarType() == MVT::i8)
2344 Cost *= 8;
2345 else if (LT.second.getScalarType() == MVT::i16)
2346 Cost *= 4;
2347 return Cost;
2348 } else {
2349 // If one of the operands is a uniform constant then the cost for each
2350 // element is the cost of insertion, extraction and division.
2351 // Insertion cost = 2, extraction cost = 2, division = cost of the
2352 // operation with the scalar type.
2353 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2354 (Op2Info.isConstant() && Op2Info.isUniform())) {
2355 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2357 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2358 return (4 + DivCost) * VTy->getNumElements();
2359 }
2360 }
2361 // On AArch64, without SVE, vector divisions are expanded
2362 // into scalar divisions of each pair of elements.
2363 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2364 CostKind, Op1Info, Op2Info);
2365 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2366 Op1Info, Op2Info);
2367 }
2368
2369 // TODO: if one of the arguments is scalar, then it's not necessary to
2370 // double the cost of handling the vector elements.
2371 Cost += Cost;
2372 }
2373 return Cost;
2374 }
2375 case ISD::MUL:
2376 // When SVE is available, we can lower the v2i64 operation using
2377 // the SVE mul instruction, which has a lower cost.
2378 if (LT.second == MVT::v2i64 && ST->hasSVE())
2379 return LT.first;
2380
2381 // When SVE is not available, there is no MUL.2d instruction,
2382 // which means mul <2 x i64> is expensive as elements are extracted
2383 // from the vectors and the muls scalarized.
2384 // As getScalarizationOverhead is a bit too pessimistic, we
2385 // estimate the cost for a i64 vector directly here, which is:
2386 // - four 2-cost i64 extracts,
2387 // - two 2-cost i64 inserts, and
2388 // - two 1-cost muls.
2389 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
2390 // LT.first = 2 the cost is 28. If both operands are extensions it will not
2391 // need to scalarize so the cost can be cheaper (smull or umull).
2393 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
2394 return LT.first;
2395 return LT.first * 14;
2396 case ISD::ADD:
2397 case ISD::XOR:
2398 case ISD::OR:
2399 case ISD::AND:
2400 case ISD::SRL:
2401 case ISD::SRA:
2402 case ISD::SHL:
2403 // These nodes are marked as 'custom' for combining purposes only.
2404 // We know that they are legal. See LowerAdd in ISelLowering.
2405 return LT.first;
2406
2407 case ISD::FADD:
2408 case ISD::FSUB:
2409 case ISD::FMUL:
2410 case ISD::FDIV:
2411 case ISD::FNEG:
2412 // These nodes are marked as 'custom' just to lower them to SVE.
2413 // We know said lowering will incur no additional cost.
2414 if (!Ty->getScalarType()->isFP128Ty())
2415 return 2 * LT.first;
2416
2417 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2418 Op2Info);
2419 }
2420}
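// A minimal scalar sketch (illustrative, not part of this file; the name is
// made up) of the ADD + CMP + SELECT + SRA expansion costed above for signed
// division by a power of two; the cmp/select appears here as the mask that
// adds the bias only for negative dividends. Assumes 1 <= K <= 30 and an
// arithmetic '>>'.
#include <cstdint>
static int32_t sketchSDivPow2(int32_t X, unsigned K) {
  int32_t Bias = (X >> 31) & ((1 << K) - 1); // 2^K - 1 only when X is negative
  return (X + Bias) >> K;                    // arithmetic shift finishes the divide
}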
2421
2423 ScalarEvolution *SE,
2424 const SCEV *Ptr) {
2425 // Address computations in vectorized code with non-consecutive addresses will
2426 // likely result in more instructions compared to scalar code where the
2427 // computation can more often be merged into the index mode. The resulting
2428 // extra micro-ops can significantly decrease throughput.
2429 unsigned NumVectorInstToHideOverhead = 10;
2430 int MaxMergeDistance = 64;
2431
2432 if (Ty->isVectorTy() && SE &&
2433 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
2434 return NumVectorInstToHideOverhead;
2435
2436 // In many cases the address computation is not merged into the instruction
2437 // addressing mode.
2438 return 1;
2439}
2440
2442 Type *CondTy,
2443 CmpInst::Predicate VecPred,
2445 const Instruction *I) {
2446 // TODO: Handle other cost kinds.
2448 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2449 I);
2450
2451 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2452 // We don't lower some vector selects well when they are wider than the
2453 // register width.
2454 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
2455 // We would need this many instructions to hide the scalarization happening.
2456 const int AmortizationCost = 20;
2457
2458 // If VecPred is not set, check if we can get a predicate from the context
2459 // instruction, if its type matches the requested ValTy.
2460 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
2461 CmpInst::Predicate CurrentPred;
2462 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
2463 m_Value())))
2464 VecPred = CurrentPred;
2465 }
2466 // Check if we have a compare/select chain that can be lowered using
2467 // a (F)CMxx & BFI pair.
2468 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
2469 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
2470 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
2471 VecPred == CmpInst::FCMP_UNE) {
2472 static const auto ValidMinMaxTys = {
2475 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
2476
2477 auto LT = getTypeLegalizationCost(ValTy);
2478 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
2479 (ST->hasFullFP16() &&
2480 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
2481 return LT.first;
2482 }
2483
2484 static const TypeConversionCostTblEntry
2485 VectorSelectTbl[] = {
2489 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
2490 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
2491 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
2492 };
2493
2494 EVT SelCondTy = TLI->getValueType(DL, CondTy);
2495 EVT SelValTy = TLI->getValueType(DL, ValTy);
2496 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
2497 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
2498 SelCondTy.getSimpleVT(),
2499 SelValTy.getSimpleVT()))
2500 return Entry->Cost;
2501 }
2502 }
2503 // The base case handles scalable vectors fine for now, since it treats the
2504 // cost as 1 * legalization cost.
2505 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2506}
2507
2509AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2511 if (ST->requiresStrictAlign()) {
2512 // TODO: Add cost modeling for strict align. Misaligned loads expand to
2513 // a bunch of instructions when strict align is enabled.
2514 return Options;
2515 }
2516 Options.AllowOverlappingLoads = true;
2517 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2518 Options.NumLoadsPerBlock = Options.MaxNumLoads;
2519 // TODO: Though vector loads usually perform well on AArch64, in some targets
2520 // they may wake up the FP unit, which raises the power consumption. Perhaps
2521 // they could be used with no holds barred (-O3).
2522 Options.LoadSizes = {8, 4, 2, 1};
2523 return Options;
2524}
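// A sketch (illustrative only; the real expansion happens during CodeGen and
// the name is made up) of what AllowOverlappingLoads buys for a 15-byte
// equality compare: two overlapping 8-byte loads per buffer instead of a byte
// loop or a libcall.
#include <cstdint>
#include <cstring>
static bool sketchEq15(const void *A, const void *B) {
  uint64_t A0, A1, B0, B1;
  std::memcpy(&A0, A, 8);
  std::memcpy(&A1, static_cast<const char *>(A) + 7, 8); // overlaps byte 7
  std::memcpy(&B0, B, 8);
  std::memcpy(&B1, static_cast<const char *>(B) + 7, 8);
  return ((A0 ^ B0) | (A1 ^ B1)) == 0;
}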
2525
2527 return ST->hasSVE();
2528}
2529
2532 Align Alignment, unsigned AddressSpace,
2534 if (useNeonVector(Src))
2535 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2536 CostKind);
2537 auto LT = getTypeLegalizationCost(Src);
2538 if (!LT.first.isValid())
2540
2541 // The code-generator is currently not able to handle scalable vectors
2542 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2543 // it. This change will be removed when code-generation for these types is
2544 // sufficiently reliable.
2545 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
2547
2548 return LT.first;
2549}
2550
2551static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
2552 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2553}
2554
2556 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
2557 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
2558 if (useNeonVector(DataTy))
2559 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2560 Alignment, CostKind, I);
2561 auto *VT = cast<VectorType>(DataTy);
2562 auto LT = getTypeLegalizationCost(DataTy);
2563 if (!LT.first.isValid())
2565
2566 // The code-generator is currently not able to handle scalable vectors
2567 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2568 // it. This change will be removed when code-generation for these types is
2569 // sufficiently reliable.
2570 if (cast<VectorType>(DataTy)->getElementCount() ==
2573
2574 ElementCount LegalVF = LT.second.getVectorElementCount();
2575 InstructionCost MemOpCost =
2576 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
2577 {TTI::OK_AnyValue, TTI::OP_None}, I);
2578 // Add on an overhead cost for using gathers/scatters.
2579 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
2580 // point we may want a per-CPU overhead.
2581 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
2582 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2583}
2584
2586 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2587}
2588
2590 MaybeAlign Alignment,
2591 unsigned AddressSpace,
2593 TTI::OperandValueInfo OpInfo,
2594 const Instruction *I) {
2595 EVT VT = TLI->getValueType(DL, Ty, true);
2596 // Type legalization can't handle structs
2597 if (VT == MVT::Other)
2598 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2599 CostKind);
2600
2601 auto LT = getTypeLegalizationCost(Ty);
2602 if (!LT.first.isValid())
2604
2605 // The code-generator is currently not able to handle scalable vectors
2606 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2607 // it. This change will be removed when code-generation for these types is
2608 // sufficiently reliable.
2609 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
2610 if (VTy->getElementCount() == ElementCount::getScalable(1))
2612
2613 // TODO: consider latency as well for TCK_SizeAndLatency.
2615 return LT.first;
2616
2618 return 1;
2619
2620 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
2621 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
2622 // Unaligned stores are extremely inefficient. We don't split all
2623 // unaligned 128-bit stores because of the negative impact that has been
2624 // shown in practice on inlined block copy code.
2625 // We make such stores expensive so that we will only vectorize if there
2626 // are 6 other instructions getting vectorized.
2627 const int AmortizationCost = 6;
2628
2629 return LT.first * 2 * AmortizationCost;
2630 }
2631
2632 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
2633 if (Ty->isPtrOrPtrVectorTy())
2634 return LT.first;
2635
2636 // Check truncating stores and extending loads.
2637 if (useNeonVector(Ty) &&
2638 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
2639 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
2640 if (VT == MVT::v4i8)
2641 return 2;
2642 // Otherwise we need to scalarize.
2643 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2644 }
2645
2646 return LT.first;
2647}
2648
2650 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2651 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2652 bool UseMaskForCond, bool UseMaskForGaps) {
2653 assert(Factor >= 2 && "Invalid interleave factor");
2654 auto *VecVTy = cast<FixedVectorType>(VecTy);
2655
2656 if (!UseMaskForCond && !UseMaskForGaps &&
2657 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2658 unsigned NumElts = VecVTy->getNumElements();
2659 auto *SubVecTy =
2660 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2661
2662 // ldN/stN only support legal vector types of size 64 or 128 in bits.
2663 // Accesses having vector types that are a multiple of 128 bits can be
2664 // matched to more than one ldN/stN instruction.
2665 bool UseScalable;
2666 if (NumElts % Factor == 0 &&
2667 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2668 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2669 }
2670
2671 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2672 Alignment, AddressSpace, CostKind,
2673 UseMaskForCond, UseMaskForGaps);
2674}
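// Illustrative source-level shape (a sketch, not part of this file; names are
// made up) of a factor-2 interleaved access that the ldN/stN costing above is
// about: de-interleaving even/odd elements maps naturally onto ld2.
#include <cstddef>
#include <cstdint>
static void sketchDeinterleave(const int16_t *In, int16_t *Even, int16_t *Odd,
                               size_t N) {
  for (size_t I = 0; I < N; ++I) {
    Even[I] = In[2 * I];    // lane 0 of each pair
    Odd[I] = In[2 * I + 1]; // lane 1 of each pair
  }
}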
2675
2680 for (auto *I : Tys) {
2681 if (!I->isVectorTy())
2682 continue;
2683 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2684 128)
2685 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2686 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2687 }
2688 return Cost;
2689}
2690
2692 return ST->getMaxInterleaveFactor();
2693}
2694
2695// For Falkor, we want to avoid having too many strided loads in a loop since
2696// that can exhaust the HW prefetcher resources. We adjust the unroller
2697// MaxCount preference below to attempt to ensure unrolling doesn't create too
2698// many strided loads.
2699static void
2702 enum { MaxStridedLoads = 7 };
2703 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2704 int StridedLoads = 0;
2705 // FIXME? We could make this more precise by looking at the CFG and
2706 // e.g. not counting loads in each side of an if-then-else diamond.
2707 for (const auto BB : L->blocks()) {
2708 for (auto &I : *BB) {
2709 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2710 if (!LMemI)
2711 continue;
2712
2713 Value *PtrValue = LMemI->getPointerOperand();
2714 if (L->isLoopInvariant(PtrValue))
2715 continue;
2716
2717 const SCEV *LSCEV = SE.getSCEV(PtrValue);
2718 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2719 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2720 continue;
2721
2722 // FIXME? We could take pairing of unrolled load copies into account
2723 // by looking at the AddRec, but we would probably have to limit this
2724 // to loops with no stores or other memory optimization barriers.
2725 ++StridedLoads;
2726 // We've seen enough strided loads that seeing more won't make a
2727 // difference.
2728 if (StridedLoads > MaxStridedLoads / 2)
2729 return StridedLoads;
2730 }
2731 }
2732 return StridedLoads;
2733 };
2734
2735 int StridedLoads = countStridedLoads(L, SE);
2736 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
2737 << " strided loads\n");
2738 // Pick the largest power of 2 unroll count that won't result in too many
2739 // strided loads.
2740 if (StridedLoads) {
2741 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
2742 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
2743 << UP.MaxCount << '\n');
2744 }
2745}
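// Worked example of the clamp above (the numbers are purely illustrative):
// with MaxStridedLoads = 7 and StridedLoads = 3, MaxCount becomes
// 1 << Log2_32(7 / 3) = 1 << Log2_32(2) = 2, so unrolling is limited to 2x.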
2746
2750 // Enable partial unrolling and runtime unrolling.
2751 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2752
2753 UP.UpperBound = true;
2754
2755 // An inner loop is more likely to be hot, and the runtime check can be
2756 // promoted out by the LICM pass, so the overhead is lower; try a larger
2757 // threshold to unroll more loops.
2758 if (L->getLoopDepth() > 1)
2759 UP.PartialThreshold *= 2;
2760
2761 // Disable partial & runtime unrolling on -Os.
2763
2767
2768 // Scan the loop: don't unroll loops with calls as this could prevent
2769 // inlining. Don't unroll vector loops either, as they don't benefit much from
2770 // unrolling.
2771 for (auto *BB : L->getBlocks()) {
2772 for (auto &I : *BB) {
2773 // Don't unroll vectorised loop.
2774 if (I.getType()->isVectorTy())
2775 return;
2776
2777 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2778 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2779 if (!isLoweredToCall(F))
2780 continue;
2781 }
2782 return;
2783 }
2784 }
2785 }
2786
2787 // Enable runtime unrolling for in-order models
2788 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
2789 // checking for that case, we can ensure that the default behaviour is
2790 // unchanged.
2792 !ST->getSchedModel().isOutOfOrder()) {
2793 UP.Runtime = true;
2794 UP.Partial = true;
2795 UP.UnrollRemainder = true;
2797
2798 UP.UnrollAndJam = true;
2800 }
2801}
2802
2806}
2807
2809 Type *ExpectedType) {
2810 switch (Inst->getIntrinsicID()) {
2811 default:
2812 return nullptr;
2813 case Intrinsic::aarch64_neon_st2:
2814 case Intrinsic::aarch64_neon_st3:
2815 case Intrinsic::aarch64_neon_st4: {
2816 // Create a struct type
2817 StructType *ST = dyn_cast<StructType>(ExpectedType);
2818 if (!ST)
2819 return nullptr;
2820 unsigned NumElts = Inst->arg_size() - 1;
2821 if (ST->getNumElements() != NumElts)
2822 return nullptr;
2823 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2824 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2825 return nullptr;
2826 }
2827 Value *Res = PoisonValue::get(ExpectedType);
2828 IRBuilder<> Builder(Inst);
2829 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2830 Value *L = Inst->getArgOperand(i);
2831 Res = Builder.CreateInsertValue(Res, L, i);
2832 }
2833 return Res;
2834 }
2835 case Intrinsic::aarch64_neon_ld2:
2836 case Intrinsic::aarch64_neon_ld3:
2837 case Intrinsic::aarch64_neon_ld4:
2838 if (Inst->getType() == ExpectedType)
2839 return Inst;
2840 return nullptr;
2841 }
2842}
2843
2845 MemIntrinsicInfo &Info) {
2846 switch (Inst->getIntrinsicID()) {
2847 default:
2848 break;
2849 case Intrinsic::aarch64_neon_ld2:
2850 case Intrinsic::aarch64_neon_ld3:
2851 case Intrinsic::aarch64_neon_ld4:
2852 Info.ReadMem = true;
2853 Info.WriteMem = false;
2854 Info.PtrVal = Inst->getArgOperand(0);
2855 break;
2856 case Intrinsic::aarch64_neon_st2:
2857 case Intrinsic::aarch64_neon_st3:
2858 case Intrinsic::aarch64_neon_st4:
2859 Info.ReadMem = false;
2860 Info.WriteMem = true;
2861 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
2862 break;
2863 }
2864
2865 switch (Inst->getIntrinsicID()) {
2866 default:
2867 return false;
2868 case Intrinsic::aarch64_neon_ld2:
2869 case Intrinsic::aarch64_neon_st2:
2870 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
2871 break;
2872 case Intrinsic::aarch64_neon_ld3:
2873 case Intrinsic::aarch64_neon_st3:
2874 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
2875 break;
2876 case Intrinsic::aarch64_neon_ld4:
2877 case Intrinsic::aarch64_neon_st4:
2878 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
2879 break;
2880 }
2881 return true;
2882}
2883
2884/// See if \p I should be considered for address type promotion. We check if \p
2885/// I is a sext with the right type and used in memory accesses. If it is used in a
2886/// "complex" getelementptr, we allow it to be promoted without finding other
2887/// sext instructions that sign extended the same initial value. A getelementptr
2888/// is considered as "complex" if it has more than 2 operands.
2890 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2891 bool Considerable = false;
2892 AllowPromotionWithoutCommonHeader = false;
2893 if (!isa<SExtInst>(&I))
2894 return false;
2895 Type *ConsideredSExtType =
2896 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2897 if (I.getType() != ConsideredSExtType)
2898 return false;
2899 // See if the sext is the one with the right type and used in at least one
2900 // GetElementPtrInst.
2901 for (const User *U : I.users()) {
2902 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2903 Considerable = true;
2904 // A getelementptr is considered as "complex" if it has more than 2
2905 // operands. We will promote a SExt used in such a complex GEP, as we
2906 // expect some computation to be merged if it is done on 64 bits.
2907 if (GEPInst->getNumOperands() > 2) {
2908 AllowPromotionWithoutCommonHeader = true;
2909 break;
2910 }
2911 }
2912 }
2913 return Considerable;
2914}
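// Illustrative source-level shape (a sketch, not part of this file; names are
// made up) of the pattern this hook cares about: 32-bit indices sign-extended
// to 64 bits and consumed by a multi-operand GEP, where promoting the sext
// lets it fold into the 64-bit address computation.
#include <cstdint>
static int32_t sketchComplexGEP(const int32_t (*M)[64], int32_t Row,
                                int32_t Col) {
  return M[Row][Col]; // a GEP with several operands fed by sext(Row)/sext(Col)
}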
2915
2917 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
2918 if (!VF.isScalable())
2919 return true;
2920
2921 Type *Ty = RdxDesc.getRecurrenceType();
2923 return false;
2924
2925 switch (RdxDesc.getRecurrenceKind()) {
2926 case RecurKind::Add:
2927 case RecurKind::FAdd:
2928 case RecurKind::And:
2929 case RecurKind::Or:
2930 case RecurKind::Xor:
2931 case RecurKind::SMin:
2932 case RecurKind::SMax:
2933 case RecurKind::UMin:
2934 case RecurKind::UMax:
2935 case RecurKind::FMin:
2936 case RecurKind::FMax:
2939 case RecurKind::FMulAdd:
2940 return true;
2941 default:
2942 return false;
2943 }
2944}
2945
2948 bool IsUnsigned,
2950 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2951
2952 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
2953 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2954
2955 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2956 "Both vector needs to be equally scalable");
2957
2958 InstructionCost LegalizationCost = 0;
2959 if (LT.first > 1) {
2960 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2961 unsigned MinMaxOpcode =
2962 Ty->isFPOrFPVectorTy()
2963 ? Intrinsic::maxnum
2964 : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2965 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2966 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2967 }
2968
2969 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2970}
2971
2973 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2974 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2975 InstructionCost LegalizationCost = 0;
2976 if (LT.first > 1) {
2977 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2978 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2979 LegalizationCost *= LT.first - 1;
2980 }
2981
2982 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2983 assert(ISD && "Invalid opcode");
2984 // Add the final reduction cost for the legal horizontal reduction
2985 switch (ISD) {
2986 case ISD::ADD:
2987 case ISD::AND:
2988 case ISD::OR:
2989 case ISD::XOR:
2990 case ISD::FADD:
2991 return LegalizationCost + 2;
2992 default:
2994 }
2995}
2996
2999 std::optional<FastMathFlags> FMF,
3002 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3003 InstructionCost BaseCost =
3004 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3005 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3006 // end up vectorizing for more computationally intensive loops.
3007 return BaseCost + FixedVTy->getNumElements();
3008 }
3009
3010 if (Opcode != Instruction::FAdd)
3012
3013 auto *VTy = cast<ScalableVectorType>(ValTy);
3015 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3016 Cost *= getMaxNumElements(VTy->getElementCount());
3017 return Cost;
3018 }
3019
3020 if (isa<ScalableVectorType>(ValTy))
3021 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3022
3023 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3024 MVT MTy = LT.second;
3025 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3026 assert(ISD && "Invalid opcode");
3027
3028 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3029 // instructions as twice a normal vector add, plus 1 for each legalization
3030 // step (LT.first). This is the only arithmetic vector reduction operation for
3031 // which we have an instruction.
3032 // OR, XOR and AND costs should match the codegen from:
3033 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3034 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3035 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3036 static const CostTblEntry CostTblNoPairwise[]{
3037 {ISD::ADD, MVT::v8i8, 2},
3038 {ISD::ADD, MVT::v16i8, 2},
3039 {ISD::ADD, MVT::v4i16, 2},
3040 {ISD::ADD, MVT::v8i16, 2},
3041 {ISD::ADD, MVT::v4i32, 2},
3042 {ISD::ADD, MVT::v2i64, 2},
3043 {ISD::OR, MVT::v8i8, 15},
3044 {ISD::OR, MVT::v16i8, 17},
3045 {ISD::OR, MVT::v4i16, 7},
3046 {ISD::OR, MVT::v8i16, 9},
3047 {ISD::OR, MVT::v2i32, 3},
3048 {ISD::OR, MVT::v4i32, 5},
3049 {ISD::OR, MVT::v2i64, 3},
3050 {ISD::XOR, MVT::v8i8, 15},
3051 {ISD::XOR, MVT::v16i8, 17},
3052 {ISD::XOR, MVT::v4i16, 7},
3053 {ISD::XOR, MVT::v8i16, 9},
3054 {ISD::XOR, MVT::v2i32, 3},
3055 {ISD::XOR, MVT::v4i32, 5},
3056 {ISD::XOR, MVT::v2i64, 3},
3057 {ISD::AND, MVT::v8i8, 15},
3058 {ISD::AND, MVT::v16i8, 17},
3059 {ISD::AND, MVT::v4i16, 7},
3060 {ISD::AND, MVT::v8i16, 9},
3061 {ISD::AND, MVT::v2i32, 3},
3062 {ISD::AND, MVT::v4i32, 5},
3063 {ISD::AND, MVT::v2i64, 3},
3064 };
3065 switch (ISD) {
3066 default:
3067 break;
3068 case ISD::ADD:
3069 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3070 return (LT.first - 1) + Entry->Cost;
3071 break;
3072 case ISD::XOR:
3073 case ISD::AND:
3074 case ISD::OR:
3075 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3076 if (!Entry)
3077 break;
3078 auto *ValVTy = cast<FixedVectorType>(ValTy);
3079 if (!ValVTy->getElementType()->isIntegerTy(1) &&
3080 MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3081 isPowerOf2_32(ValVTy->getNumElements())) {
3082 InstructionCost ExtraCost = 0;
3083 if (LT.first != 1) {
3084 // Type needs to be split, so there is an extra cost of LT.first - 1
3085 // arithmetic ops.
3086 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3087 MTy.getVectorNumElements());
3088 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3089 ExtraCost *= LT.first - 1;
3090 }
3091 return Entry->Cost + ExtraCost;
3092 }
3093 break;
3094 }
3095 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3096}
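// Illustrative intrinsics-level example (a sketch, not part of this file; the
// name is made up) of the horizontal integer add reduction costed above, which
// lowers to a single addv.
#include <arm_neon.h>
#include <cstdint>
static int32_t sketchHorizontalAdd(int32x4_t V) {
  return vaddvq_s32(V); // addv s0, v0.4s
}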
3097
3099 static const CostTblEntry ShuffleTbl[] = {
3113 };
3114
3115 // The code-generator is currently not able to handle scalable vectors
3116 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3117 // it. This change will be removed when code-generation for these types is
3118 // sufficiently reliable.
3121
3122 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3123 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3125 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3126 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3127 : LT.second;
3128 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3129 InstructionCost LegalizationCost = 0;
3130 if (Index < 0) {
3131 LegalizationCost =
3132 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3134 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3136 }
3137
3138 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
3139 // The cost is computed on the promoted type.
3140 if (LT.second.getScalarType() == MVT::i1) {
3141 LegalizationCost +=
3142 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3144 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3146 }
3147 const auto *Entry =
3148 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3149 assert(Entry && "Illegal Type for Splice");
3150 LegalizationCost += Entry->Cost;
3151 return LegalizationCost * LT.first;
3152}
3153
3155 VectorType *Tp,
3156 ArrayRef<int> Mask,
3158 int Index, VectorType *SubTp,
3160 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3161 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3162 // into smaller vectors and sum the cost of each shuffle.
3163 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3164 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3165 cast<FixedVectorType>(Tp)->getNumElements() >
3166 LT.second.getVectorNumElements() &&
3167 !Index && !SubTp) {
3168 unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
3169 assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
3170 unsigned LTNumElts = LT.second.getVectorNumElements();
3171 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3172 VectorType *NTp =
3173 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3175 for (unsigned N = 0; N < NumVecs; N++) {
3176 SmallVector<int> NMask;
3177 // Split the existing mask into chunks of size LTNumElts. Track the source
3178 // sub-vectors to ensure the result has at most 2 inputs.
3179 unsigned Source1, Source2;
3180 unsigned NumSources = 0;
3181 for (unsigned E = 0; E < LTNumElts; E++) {
3182 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3183 : UndefMaskElem;
3184 if (MaskElt < 0) {
3185 NMask.push_back(UndefMaskElem);
3186 continue;
3187 }
3188
3189 // Calculate which source from the input this comes from and whether it
3190 // is new to us.
3191 unsigned Source = MaskElt / LTNumElts;
3192 if (NumSources == 0) {
3193 Source1 = Source;
3194 NumSources = 1;
3195 } else if (NumSources == 1 && Source != Source1) {
3196 Source2 = Source;
3197 NumSources = 2;
3198 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3199 NumSources++;
3200 }
3201
3202 // Add to the new mask. For the NumSources>2 case these are not correct,
3203 // but are only used for the modular lane number.
3204 if (Source == Source1)
3205 NMask.push_back(MaskElt % LTNumElts);
3206 else if (Source == Source2)
3207 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3208 else
3209 NMask.push_back(MaskElt % LTNumElts);
3210 }
3211 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3212 // getShuffleCost. If not then cost it using the worst case.
3213 if (NumSources <= 2)
3214 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3216 NTp, NMask, CostKind, 0, nullptr, Args);
3217 else if (any_of(enumerate(NMask), [&](const auto &ME) {
3218 return ME.value() % LTNumElts == ME.index();
3219 }))
3220 Cost += LTNumElts - 1;
3221 else
3222 Cost += LTNumElts;
3223 }
3224 return Cost;
3225 }
3226
3227 Kind = improveShuffleKindFromMask(Kind, Mask);
3228
3229 // Check for broadcast loads, which are supported by the LD1R instruction.
3230 // In terms of code-size, the shuffle vector is free when a load + dup get
3231 // folded into a LD1R. That's what we check and return here. For performance
3232 // and reciprocal throughput, a LD1R is not completely free. In this case, we
3233 // return the cost for the broadcast below (i.e. 1 for most/all types), so
3234 // that we model the load + dup sequence slightly higher because LD1R is a
3235 // high latency instruction.
3236 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
3237 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3238 if (IsLoad && LT.second.isVector() &&
3240 LT.second.getVectorElementCount()))
3241 return 0;
3242 }
3243
3244 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3245 // from the perfect shuffle tables.
3246 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3247 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3248 all_of(Mask, [](int E) { return E < 8; }))
3249 return getPerfectShuffleCost(Mask);
3250
3251 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3252 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3253 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3254 static const CostTblEntry ShuffleTbl[] = {
3255 // Broadcast shuffle kinds can be performed with 'dup'.
3268 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3269 // 'zip1/zip2' instructions.
3282 // Select shuffle kinds.
3283 // TODO: handle vXi8/vXi16.
3284 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
3285 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
3286 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
3287 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
3288 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
3289 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
3290 // PermuteSingleSrc shuffle kinds.
3292 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
3295 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
3297 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3298 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
3300 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
3301 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
3302 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3303 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
3304 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
3305 // Reverse can be lowered with `rev`.
3306 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
3307 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
3308 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
3309 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
3310 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
3311 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
3312 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
3313 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
3314 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
3315 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
3316 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
3317 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
3318 // Splice can all be lowered as `ext`.
3333 // Broadcast shuffle kinds for scalable vectors
3351 // Handle the cases for vector.reverse with scalable vectors
3369 };
3370 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
3371 return LT.first * Entry->Cost;
3372 }
3373
3374 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
3375 return getSpliceCost(Tp, Index);
3376
3377 // Inserting a subvector can often be done with either a D, S or H register
3378 // move, so long as the inserted vector is "aligned".
3379 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
3380 LT.second.getSizeInBits() <= 128 && SubTp) {
3381 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
3382 if (SubLT.second.isVector()) {
3383 int NumElts = LT.second.getVectorNumElements();
3384 int NumSubElts = SubLT.second.getVectorNumElements();
3385 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
3386 return SubLT.first;
3387 }
3388 }
3389
3390 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
3391}
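// Illustrative example (a sketch, not part of this file; it relies on Clang's
// vector extensions and the names are made up) of a shuffle covered by the
// table above: reversing a v4i32 lowers to REV64 + EXT, matching the cost of 2
// listed for SK_Reverse on v4i32.
#include <cstdint>
typedef int32_t SketchV4I32 __attribute__((vector_size(16)));
static SketchV4I32 sketchReverse(SketchV4I32 V) {
  return __builtin_shufflevector(V, V, 3, 2, 1, 0);
}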
3392
3395 const ValueToValueMap &Strides = ValueToValueMap();
3396 for (BasicBlock *BB : TheLoop->blocks()) {
3397 // Scan the instructions in the block and look for addresses that are
3398 // consecutive and decreasing.
3399 for (Instruction &I : *BB) {
3400 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
3402 Type *AccessTy = getLoadStoreType(&I);
3403 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
3404 /*ShouldCheckWrap=*/false)
3405 .value_or(0) < 0)
3406 return true;
3407 }
3408 }
3409 }
3410 return false;
3411}
3412
3414 Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
3416 InterleavedAccessInfo *IAI) {
3417 if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
3418 return false;
3419
3420 // We don't currently support vectorisation with interleaving for SVE - with
3421 // such loops we're better off not using tail-folding. This gives us a chance
3422 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
3423 if (IAI->hasGroups())
3424 return false;
3425
3426 TailFoldingKind Required; // Defaults to 0.
3427 if (LVL->getReductionVars().size())
3428 Required.add(TailFoldingKind::TFReductions);
3429 if (LVL->getFixedOrderRecurrences().size())
3430 Required.add(TailFoldingKind::TFRecurrences);
3431
3432 // We call this to discover whether any load/store pointers in the loop have
3433 // negative strides. This will require extra work to reverse the loop
3434 // predicate, which may be expensive.
3436 Required.add(TailFoldingKind::TFReverse);
3437 if (!Required)
3438 Required.add(TailFoldingKind::TFSimple);
3439
3440 return (TailFoldingKindLoc & Required) == Required;
3441}
3442
3445 int64_t BaseOffset, bool HasBaseReg,
3446 int64_t Scale, unsigned AddrSpace) const {
3447 // Scaling factors are not free at all.
3448 // Operands | Rt Latency
3449 // -------------------------------------------
3450 // Rt, [Xn, Xm] | 4
3451 // -------------------------------------------
3452 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
3453 // Rt, [Xn, Wm, <extend> #imm] |
3455 AM.BaseGV = BaseGV;
3456 AM.BaseOffs = BaseOffset;
3457 AM.HasBaseReg = HasBaseReg;
3458 AM.Scale = Scale;
3459 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
3460 // Scale represents reg2 * scale, thus account for 1 if
3461 // it is not equal to 0 or 1.
3462 return AM.Scale != 0 && AM.Scale != 1;
3463 return -1;
3464}
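// Illustrative source-level shape (a sketch, not part of this file; the name
// is made up) of the scaled addressing mode whose extra latency the hook above
// accounts for: the load below typically becomes "ldr w0, [x0, x1, lsl #2]".
#include <cstdint>
static int32_t sketchScaledLoad(const int32_t *Base, int64_t Index) {
  return Base[Index];
}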
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
cl::opt< TailFoldingKind, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE:" "\ndisabled No loop types will vectorize using tail-folding" "\ndefault Uses the default tail-folding settings for the target " "CPU" "\nall All legal loop types will vectorize using tail-folding" "\nsimple Use tail-folding for simple loops (not reductions or " "recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nreverse Use tail-folding for loops requiring reversed " "predicates"), cl::location(TailFoldingKindLoc))
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
TailFoldingKind TailFoldingKindLoc
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file defines a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
IntegerType * Int32Ty
#define P(N)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isStreamingSVEModeDisabled() const
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
bool useSVEForFixedLengthVectors() const
unsigned getMinSVEVectorSizeInBits() const
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
unsigned getMaxInterleaveFactor(ElementCount VF)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:75
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:441
void negate()
Negate this APInt in place.
Definition: APInt.h:1421
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1002
unsigned logBase2() const
Definition: APInt.h:1707
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:815
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:432
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1516
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:538
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:851
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:965
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr)
Definition: BasicTTIImpl.h:328
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:610
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:815
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:929
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:995
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Instruction *CopyO, const Twine &Name="", Instruction *InsertBefore=nullptr)
Definition: InstrTypes.h:248
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1353
unsigned arg_size() const
Definition: InstrTypes.h:1351
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:718
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:721
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:724
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:722
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:723
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:725
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:734
bool isIntPredicate() const
Definition: InstrTypes.h:826
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1586
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
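A brief usage sketch of the splat behaviour described here, assuming the usual LLVM IR headers:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

// Builds a <4 x i32> constant with every lane equal to 1.
llvm::Constant *makeSplatOfOne(llvm::LLVMContext &Ctx) {
  auto *VecTy = llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  return llvm::ConstantInt::get(VecTy, 1);
}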
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:136
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:356
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:294
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:291
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
bool allowContract() const
Definition: FMF.h:71
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:704
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:940
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2564
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:45
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:418
static InstructionCost getInvalid(CostType Val=0)
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:70
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:781
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:845
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:195
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:61
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:651
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1750
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:69
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNewZAInterface() const
std::optional< bool > requiresSMChange(const SMEAttrs &Callee, bool BodyOverridesInterface=false) const
bool requiresLazySave(const SMEAttrs &Callee) const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:725
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
size_type size() const
Definition: SmallPtrSet.h:93
size_t size() const
Definition: SmallVector.h:91
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
void resize(size_type N)
Definition: SmallVector.h:642
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
An instruction for storing to memory.
Definition: Instructions.h:301
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:687
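A small usage sketch of this overload, assuming the LLVM ADT headers:

#include "llvm/ADT/StringRef.h"

// Splits around the first '+': First == "reductions", Rest == "reverse".
void splitOnce() {
  llvm::StringRef S("reductions+reverse");
  auto [First, Rest] = S.split('+');
  (void)First;
  (void)Rest;
}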
Class to represent struct types.
Definition: DerivedTypes.h:213
Provides information about what library functions are available for the current target.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:325
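For illustration, the two factories described here distinguish fixed-width from scalable (vscale-multiplied) sizes; a sketch assuming llvm/Support/TypeSize.h:

#include "llvm/Support/TypeSize.h"

void typeSizeExamples() {
  llvm::TypeSize Neon = llvm::TypeSize::getFixed(128);    // exactly 128 bits
  llvm::TypeSize Sve  = llvm::TypeSize::getScalable(128); // 128 bits x vscale
  (void)Neon;
  (void)Sve;
}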
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:267
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
static IntegerType * getInt1Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:163
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:264
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:231
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:219
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:350
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:920
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:996
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:627
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:688
Type * getElementType() const
Definition: DerivedTypes.h:422
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:786
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:898
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:773