#include "llvm/IR/IntrinsicsAArch64.h"

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::desc("Enable AArch64 logical imm instruction "
    cl::desc("Combine extends of AArch64 masked "
             "gather intrinsics"),
    cl::desc("Combine ext and trunc to TBL"),
    cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
                                       AArch64::X3, AArch64::X4, AArch64::X5,
                                       AArch64::X6, AArch64::X7};
                                       AArch64::Q3, AArch64::Q4, AArch64::Q5,
                                       AArch64::Q6, AArch64::Q7};
  return MVT::nxv8bf16;
  switch (EC.getKnownMinValue()) {
         "Expected scalable predicate vector type!");
         "Expected legal vector type!");
  switch (Op.getOpcode()) {
    switch (Op.getConstantOperandVal(0)) {
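    // Predicate-producing SVE intrinsics: ptrue/pnext, integer and FP
    // compares, while-loop predicates, and match/nmatch.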
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
    case Intrinsic::aarch64_sve_facgt:
    case Intrinsic::aarch64_sve_facge:
    case Intrinsic::aarch64_sve_whilege:
    case Intrinsic::aarch64_sve_whilegt:
    case Intrinsic::aarch64_sve_whilehi:
    case Intrinsic::aarch64_sve_whilehs:
    case Intrinsic::aarch64_sve_whilele:
    case Intrinsic::aarch64_sve_whilelo:
    case Intrinsic::aarch64_sve_whilels:
    case Intrinsic::aarch64_sve_whilelt:
    case Intrinsic::aarch64_sve_match:
    case Intrinsic::aarch64_sve_nmatch:
    case Intrinsic::aarch64_sve_whilege_x2:
    case Intrinsic::aarch64_sve_whilegt_x2:
    case Intrinsic::aarch64_sve_whilehi_x2:
    case Intrinsic::aarch64_sve_whilehs_x2:
    case Intrinsic::aarch64_sve_whilele_x2:
    case Intrinsic::aarch64_sve_whilelo_x2:
    case Intrinsic::aarch64_sve_whilels_x2:
    case Intrinsic::aarch64_sve_whilelt_x2:
static std::tuple<SDValue, SDValue>
  const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
  if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
    AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
  return std::make_tuple(
  if (Subtarget->hasLS64()) {
  if (Subtarget->hasFPARMv8()) {
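  // Register the NEON 64-bit (D) and 128-bit (Q) vector types with their
  // register classes so the rest of lowering treats them as legal.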
  if (Subtarget->hasNEON()) {
    addDRType(MVT::v2f32);
    addDRType(MVT::v8i8);
    addDRType(MVT::v4i16);
    addDRType(MVT::v2i32);
    addDRType(MVT::v1i64);
    addDRType(MVT::v1f64);
    addDRType(MVT::v4f16);
    addDRType(MVT::v4bf16);
    addQRType(MVT::v4f32);
    addQRType(MVT::v2f64);
    addQRType(MVT::v16i8);
    addQRType(MVT::v8i16);
    addQRType(MVT::v4i32);
    addQRType(MVT::v2i64);
    addQRType(MVT::v8f16);
    addQRType(MVT::v8bf16);
  if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
  if (Subtarget->hasFPARMv8()) {
  if (Subtarget->hasFPARMv8()) {
  if (Subtarget->hasCSSC()) {
  if (Subtarget->hasFullFP16()) {
  auto LegalizeNarrowFP = [this](MVT ScalarVT) {
  if (!Subtarget->hasFullFP16()) {
    LegalizeNarrowFP(MVT::f16);
  LegalizeNarrowFP(MVT::bf16);
  for (MVT Ty : {MVT::f32, MVT::f64})
  if (Subtarget->hasFullFP16())
  for (MVT Ty : {MVT::f32, MVT::f64})
  if (Subtarget->hasFullFP16())
  for (auto VT : {MVT::f32, MVT::f64})
  if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
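    // Names of the __aarch64_* outlined atomic helper libcalls, one per
    // access size and memory ordering.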
#define LCALLNAMES(A, B, N)                                                    \
  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
#define LCALLNAME4(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
#define LCALLNAME5(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2)                                                          \
  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
  if (Subtarget->hasLSE128()) {
  if (Subtarget->hasLSE2()) {
  if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
  if (Subtarget->hasFPARMv8()) {
    for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
    if (Subtarget->hasFullFP16()) {
    for (auto VT : {MVT::v1i64, MVT::v2i64}) {
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
    if (Subtarget->hasFullFP16())
      for (MVT Ty : {MVT::v4f16, MVT::v8f16})
    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
    if (Subtarget->hasFullFP16())
      for (MVT Ty : {MVT::v4f16, MVT::v8f16})
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
    for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
    if (VT.is128BitVector() || VT.is64BitVector()) {
    for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
  if (Subtarget->hasSME()) {
         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
    for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
  if (Subtarget->hasSVE2() ||
      (Subtarget->hasSME() && Subtarget->isStreaming()))
    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
    for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
         { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
           MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
      if (VT != MVT::nxv16i1) {
         {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
          MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
          MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
    for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
                    MVT::nxv4f32, MVT::nxv2f64}) {
    for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
    if (Subtarget->hasSVEB16B16()) {
    if (!Subtarget->hasSVEB16B16()) {
    for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
                    MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
        addTypeForFixedLengthSVE(VT);
        addTypeForFixedLengthSVE(VT);
    for (auto VT : {MVT::v8i8, MVT::v4i16})
    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
    for (auto VT : {MVT::v8f16, MVT::v4f32})
    for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                    MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
    for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
    for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
                    MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
                    MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
                    MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
                    MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
                    MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
                    MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
    for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
                    MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
                    MVT::v2f32, MVT::v4f32, MVT::v2f64})
         {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
          MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
    for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
                    MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
  if (Subtarget->hasSVE2()) {
  if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
  if (Subtarget->hasSVE()) {
  for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
    if ((libcallName != nullptr) && (libcallName[0] != '#')) {
void AArch64TargetLowering::addTypeForNEON(MVT VT) {
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
        VT == MVT::v8f16) &&
       Subtarget->hasFullFP16()))
  if (VT != MVT::v8i8 && VT != MVT::v16i8)
  for (unsigned Opcode :
  for (unsigned Opcode :
  if (Subtarget->hasD128()) {
  if (!Subtarget->hasSVE())
  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
      ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
      ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
  if (OpVT != MVT::i32 && OpVT != MVT::i64)
  if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
  auto Op1 = I->getOperand(1);
  return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
         VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
         VT != MVT::v4i1 && VT != MVT::v2i1;
    unsigned SearchSize) const {
  if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
    return SearchSize != 8;
  if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
    return SearchSize != 8 && SearchSize != 16;
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
  while (InnerVT != VT) {
  while (InnerVT != VT) {
void AArch64TargetLowering::addDRType(MVT VT) {
void AArch64TargetLowering::addQRType(MVT VT) {
  Imm = C->getZExtValue();
  return N->getOpcode() == Opc &&
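// Try to flip bits that are not demanded so that the immediate becomes
// encodable as an AArch64 logical immediate, avoiding a separate move to
// materialize it.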
                                const APInt &Demanded,
  uint64_t OldImm = Imm, NewImm, Enc;
  if (Imm == 0 || Imm == Mask ||
  unsigned EltSize = Size;
        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
    uint64_t Sum = RotatedImm + NonDemandedBits;
    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
    NewImm = (Imm | Ones) & Mask;
    while (EltSize < Size) {
      NewImm |= NewImm << EltSize;
         "demanded bits should never be altered");
  assert(OldImm != NewImm &&
         "the new imm shouldn't be equal to the old imm");
  EVT VT = Op.getValueType();
  if (NewImm == 0 || NewImm == OrigMask) {
  EVT VT = Op.getValueType();
         "i32 or i64 is expected after legalization.");
  switch (Op.getOpcode()) {
    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
  switch (Op.getOpcode()) {
    if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
      assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
             "Expected DUP implicit truncation");
      Known = Known.trunc(Op.getScalarValueSizeInBits());
        ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
  case Intrinsic::aarch64_ldaxr:
  case Intrinsic::aarch64_ldxr: {
    EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
    unsigned IntNo = Op.getConstantOperandVal(0);
    case Intrinsic::aarch64_neon_uaddlv: {
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
    unsigned Depth) const {
  EVT VT = Op.getValueType();
  unsigned Opcode = Op.getOpcode();
    return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
    unsigned *Fast) const {
  if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
  if (Subtarget->requiresStrictAlign())
    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
    unsigned *Fast) const {
  if (Subtarget->requiresStrictAlign())
    *Fast = !Subtarget->isMisaligned128StoreSlow() ||
#define MAKE_CASE(V) \
  Register DestReg = MI.getOperand(0).getReg();
  Register IfTrueReg = MI.getOperand(1).getReg();
  Register IfFalseReg = MI.getOperand(2).getReg();
  unsigned CondCode = MI.getOperand(3).getImm();
  bool NZCVKilled = MI.getOperand(4).isKill();
  MI.eraseFromParent();
         "SEH does not use catchret!");
  Register TargetReg = MI.getOperand(0).getReg();
      TII.probedStackAlloc(MBBI, TargetReg, false);
  MI.eraseFromParent();
  return NextInst->getParent();
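  // Forward the pseudo-instruction's operands, unchanged, to the real
  // instruction being built.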
  MIB.add(MI.getOperand(1));
  MIB.add(MI.getOperand(2));
  MIB.add(MI.getOperand(3));
  MIB.add(MI.getOperand(4));
  MIB.add(MI.getOperand(5));
  MI.eraseFromParent();
  MIB.add(MI.getOperand(0));
  MIB.add(MI.getOperand(1));
  MIB.add(MI.getOperand(2));
  MIB.add(MI.getOperand(1));
  MI.eraseFromParent();
    bool Op0IsDef) const {
  for (unsigned I = 1; I < MI.getNumOperands(); ++I)
    MIB.add(MI.getOperand(I));
  MI.eraseFromParent();
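  // SME ZA pseudo expansion: when the pseudo addresses a specific ZA tile,
  // the tile register (BaseReg plus the tile-index immediate) is added as an
  // explicit operand before the remaining operands are copied across.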
  unsigned StartIdx = 0;
  bool HasTile = BaseReg != AArch64::ZA;
  bool HasZPROut = HasTile && MI.getOperand(0).isReg();
    MIB.add(MI.getOperand(StartIdx));
    MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
    MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm());
    if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
      MIB.add(MI.getOperand(StartIdx));
  for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
    MIB.add(MI.getOperand(I));
  MI.eraseFromParent();
  MIB.add(MI.getOperand(0));
  unsigned Mask = MI.getOperand(0).getImm();
  for (unsigned I = 0; I < 8; I++) {
    if (Mask & (1 << I))
  MI.eraseFromParent();
  if (TPIDR2.Uses > 0) {
         "Lazy ZA save is not yet supported on Windows");
  if (TPIDR2.Uses > 0) {
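    // Allocate the lazy ZA save buffer by subtracting its size from the
    // current stack pointer.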
    Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
    auto Size = MI.getOperand(1).getReg();
    auto Dest = MI.getOperand(0).getReg();
    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
         "Lazy ZA save is not yet supported on Windows");
    auto Size = MI.getOperand(1).getReg();
    auto Dest = MI.getOperand(0).getReg();
    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
            MI.getOperand(0).getReg());