LLVM 17.0.0git
AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Triple.h"
31#include "llvm/ADT/Twine.h"
54#include "llvm/IR/Attributes.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/DebugLoc.h"
59#include "llvm/IR/Function.h"
61#include "llvm/IR/GlobalValue.h"
62#include "llvm/IR/IRBuilder.h"
63#include "llvm/IR/Instruction.h"
66#include "llvm/IR/Intrinsics.h"
67#include "llvm/IR/IntrinsicsAArch64.h"
68#include "llvm/IR/Module.h"
71#include "llvm/IR/Type.h"
72#include "llvm/IR/Use.h"
73#include "llvm/IR/Value.h"
79#include "llvm/Support/Debug.h"
88#include <algorithm>
89#include <bitset>
90#include <cassert>
91#include <cctype>
92#include <cstdint>
93#include <cstdlib>
94#include <iterator>
95#include <limits>
96#include <optional>
97#include <tuple>
98#include <utility>
99#include <vector>
100
101using namespace llvm;
102using namespace llvm::PatternMatch;
103
104#define DEBUG_TYPE "aarch64-lower"
105
106STATISTIC(NumTailCalls, "Number of tail calls");
107STATISTIC(NumShiftInserts, "Number of vector shift inserts");
108STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
109
110// FIXME: The necessary dtprel relocations don't seem to be supported
111// well in the GNU bfd and gold linkers at the moment. Therefore, by
112// default, for now, fall back to GeneralDynamic code generation.
114 "aarch64-elf-ldtls-generation", cl::Hidden,
115 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
116 cl::init(false));
117
118static cl::opt<bool>
119EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
120 cl::desc("Enable AArch64 logical imm instruction "
121 "optimization"),
122 cl::init(true));
123
124// Temporary option added for the purpose of testing functionality added
125// to DAGCombiner.cpp in D92230. It is expected that this can be removed
126// in the future when both implementations are based on MGATHER rather
127// than the GLD1 nodes added for the SVE gather load intrinsics.
128static cl::opt<bool>
129EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
130 cl::desc("Combine extends of AArch64 masked "
131 "gather intrinsics"),
132 cl::init(true));
133
134// All of the XOR, OR and CMP operations use ALU ports, and the data dependency
135// becomes the bottleneck after this transform on high-end CPUs. So this maximum
136// leaf-node limit guards that converting to cmp+ccmp remains profitable.
137static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
138 cl::desc("Maximum of xors"));
139
140/// Value type used for condition codes.
141static const MVT MVT_CC = MVT::i32;
142
143static inline EVT getPackedSVEVectorVT(EVT VT) {
144 switch (VT.getSimpleVT().SimpleTy) {
145 default:
146 llvm_unreachable("unexpected element type for vector");
147 case MVT::i8:
148 return MVT::nxv16i8;
149 case MVT::i16:
150 return MVT::nxv8i16;
151 case MVT::i32:
152 return MVT::nxv4i32;
153 case MVT::i64:
154 return MVT::nxv2i64;
155 case MVT::f16:
156 return MVT::nxv8f16;
157 case MVT::f32:
158 return MVT::nxv4f32;
159 case MVT::f64:
160 return MVT::nxv2f64;
161 case MVT::bf16:
162 return MVT::nxv8bf16;
163 }
164}
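
// Illustrative reading of the switch above (not part of the original file):
// getPackedSVEVectorVT(MVT::i8) == MVT::nxv16i8 and
// getPackedSVEVectorVT(MVT::f64) == MVT::nxv2f64, i.e. each element type is
// mapped to the scalable vector type that exactly fills the minimum 128-bit
// SVE register granule.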
165
166// NOTE: Currently there's only a need to return integer vector types. If this
167// changes then just add an extra "type" parameter.
169 switch (EC.getKnownMinValue()) {
170 default:
171 llvm_unreachable("unexpected element count for vector");
172 case 16:
173 return MVT::nxv16i8;
174 case 8:
175 return MVT::nxv8i16;
176 case 4:
177 return MVT::nxv4i32;
178 case 2:
179 return MVT::nxv2i64;
180 }
181}
182
185 "Expected scalable predicate vector type!");
186 switch (VT.getVectorMinNumElements()) {
187 default:
188 llvm_unreachable("unexpected element count for vector");
189 case 2:
190 return MVT::nxv2i64;
191 case 4:
192 return MVT::nxv4i32;
193 case 8:
194 return MVT::nxv8i16;
195 case 16:
196 return MVT::nxv16i8;
197 }
198}
199
200/// Returns true if VT's elements occupy the lowest bit positions of its
201/// associated register class without any intervening space.
202///
203/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
204/// same register class, but only nxv8f16 can be treated as a packed vector.
205static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
207 "Expected legal vector type!");
208 return VT.isFixedLengthVector() ||
210}
211
212// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
213// predicate and end with a passthru value matching the result type.
214static bool isMergePassthruOpcode(unsigned Opc) {
215 switch (Opc) {
216 default:
217 return false;
247 return true;
248 }
249}
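
// Illustrative shape (not part of the original file): a merge-passthru node
// such as AArch64ISD::FCEIL_MERGE_PASSTHRU takes a governing predicate as its
// first operand and a passthru value as its last; lanes where the predicate
// is inactive take their result from the passthru operand.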
250
251// Returns true if inactive lanes are known to be zeroed by construction.
253 switch (Op.getOpcode()) {
254 default:
255 // We guarantee i1 splat_vectors to zero the other lanes by
256 // implementing it with ptrue and possibly a punpklo for nxv1i1.
257 if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
258 return true;
259 return false;
262 return true;
264 switch (Op.getConstantOperandVal(0)) {
265 default:
266 return false;
267 case Intrinsic::aarch64_sve_ptrue:
268 case Intrinsic::aarch64_sve_pnext:
269 case Intrinsic::aarch64_sve_cmpeq:
270 case Intrinsic::aarch64_sve_cmpne:
271 case Intrinsic::aarch64_sve_cmpge:
272 case Intrinsic::aarch64_sve_cmpgt:
273 case Intrinsic::aarch64_sve_cmphs:
274 case Intrinsic::aarch64_sve_cmphi:
275 case Intrinsic::aarch64_sve_cmpeq_wide:
276 case Intrinsic::aarch64_sve_cmpne_wide:
277 case Intrinsic::aarch64_sve_cmpge_wide:
278 case Intrinsic::aarch64_sve_cmpgt_wide:
279 case Intrinsic::aarch64_sve_cmplt_wide:
280 case Intrinsic::aarch64_sve_cmple_wide:
281 case Intrinsic::aarch64_sve_cmphs_wide:
282 case Intrinsic::aarch64_sve_cmphi_wide:
283 case Intrinsic::aarch64_sve_cmplo_wide:
284 case Intrinsic::aarch64_sve_cmpls_wide:
285 case Intrinsic::aarch64_sve_fcmpeq:
286 case Intrinsic::aarch64_sve_fcmpne:
287 case Intrinsic::aarch64_sve_fcmpge:
288 case Intrinsic::aarch64_sve_fcmpgt:
289 case Intrinsic::aarch64_sve_fcmpuo:
290 case Intrinsic::aarch64_sve_facgt:
291 case Intrinsic::aarch64_sve_facge:
292 case Intrinsic::aarch64_sve_whilege:
293 case Intrinsic::aarch64_sve_whilegt:
294 case Intrinsic::aarch64_sve_whilehi:
295 case Intrinsic::aarch64_sve_whilehs:
296 case Intrinsic::aarch64_sve_whilele:
297 case Intrinsic::aarch64_sve_whilelo:
298 case Intrinsic::aarch64_sve_whilels:
299 case Intrinsic::aarch64_sve_whilelt:
300 case Intrinsic::aarch64_sve_match:
301 case Intrinsic::aarch64_sve_nmatch:
302 case Intrinsic::aarch64_sve_whilege_x2:
303 case Intrinsic::aarch64_sve_whilegt_x2:
304 case Intrinsic::aarch64_sve_whilehi_x2:
305 case Intrinsic::aarch64_sve_whilehs_x2:
306 case Intrinsic::aarch64_sve_whilele_x2:
307 case Intrinsic::aarch64_sve_whilelo_x2:
308 case Intrinsic::aarch64_sve_whilels_x2:
309 case Intrinsic::aarch64_sve_whilelt_x2:
310 return true;
311 }
312 }
313}
314
316 const AArch64Subtarget &STI)
317 : TargetLowering(TM), Subtarget(&STI) {
318 // AArch64 doesn't have comparisons that set GPRs or a setcc instruction, so
319 // we have to make something up. Arbitrarily, choose ZeroOrOne.
321 // When comparing vectors, the result sets each element of the vector to
322 // all-ones or all-zeros.
324
325 // Set up the register classes.
326 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
327 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
328
329 if (Subtarget->hasLS64()) {
330 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
333 }
334
335 if (Subtarget->hasFPARMv8()) {
336 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
337 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
338 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
339 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
340 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
341 }
342
343 if (Subtarget->hasNEON()) {
344 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
345 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
346 // Someone set us up the NEON.
347 addDRTypeForNEON(MVT::v2f32);
348 addDRTypeForNEON(MVT::v8i8);
349 addDRTypeForNEON(MVT::v4i16);
350 addDRTypeForNEON(MVT::v2i32);
351 addDRTypeForNEON(MVT::v1i64);
352 addDRTypeForNEON(MVT::v1f64);
353 addDRTypeForNEON(MVT::v4f16);
354 if (Subtarget->hasBF16())
355 addDRTypeForNEON(MVT::v4bf16);
356
357 addQRTypeForNEON(MVT::v4f32);
358 addQRTypeForNEON(MVT::v2f64);
359 addQRTypeForNEON(MVT::v16i8);
360 addQRTypeForNEON(MVT::v8i16);
361 addQRTypeForNEON(MVT::v4i32);
362 addQRTypeForNEON(MVT::v2i64);
363 addQRTypeForNEON(MVT::v8f16);
364 if (Subtarget->hasBF16())
365 addQRTypeForNEON(MVT::v8bf16);
366 }
367
368 if (Subtarget->hasSVEorSME()) {
369 // Add legal SVE predicate types
370 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
371 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
372 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
373 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
374 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
375
376 // Add legal SVE data types
377 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
378 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
379 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
380 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
381
382 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
383 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
384 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
385 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
386 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
387 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
388
389 if (Subtarget->hasBF16()) {
390 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
391 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
392 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
393 }
394
395 if (Subtarget->useSVEForFixedLengthVectors()) {
398 addRegisterClass(VT, &AArch64::ZPRRegClass);
399
402 addRegisterClass(VT, &AArch64::ZPRRegClass);
403 }
404 }
405
406 // Compute derived properties from the register classes
408
409 // Provide all sorts of operation actions
446
450
454
456
457 // Custom lowering hooks are needed for XOR
458 // to fold it into CSINC/CSINV.
461
462 // Virtually no operation on f128 is legal, but LLVM can't expand them when
463 // there's a valid register class, so we need custom operations in most cases.
487 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
488 // aren't handled.
489
490 // Lowering for many of the conversions is actually specified by the non-f128
491 // type. The LowerXXX function will be trivial when f128 isn't involved.
522
527
528 // Variable arguments.
533
534 // Variable-sized objects.
537
538 if (Subtarget->isTargetWindows())
540 else
542
543 // Constant pool entries
545
546 // BlockAddress
548
549 // AArch64 lacks both left-rotate and popcount instructions.
555 }
556
557 // AArch64 doesn't have i32 MULH{S|U}.
560
561 // AArch64 doesn't have {U|S}MUL_LOHI.
564
565 if (Subtarget->hasCSSC()) {
569
571
575
578
583
588 } else {
592
595
598 }
599
605 }
612
613 // Custom lower Add/Sub/Mul with overflow.
626
635
644 if (Subtarget->hasFullFP16())
646 else
648
649 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
659 }
660
661 if (!Subtarget->hasFullFP16()) {
662 for (auto Op :
678
679 // Round-to-integer needs custom lowering for fp16, as Promote doesn't work
680 // because the result type is integer.
684
685 // Promote v4f16 to v4f32 when that is known to be safe.
690
707
729 }
730
731 // AArch64 has implementations of a lot of rounding-like FP operations.
732 for (auto Op :
743 for (MVT Ty : {MVT::f32, MVT::f64})
744 setOperationAction(Op, Ty, Legal);
745 if (Subtarget->hasFullFP16())
747 }
748
749 // Basic strict FP operations are legal
752 for (MVT Ty : {MVT::f32, MVT::f64})
753 setOperationAction(Op, Ty, Legal);
754 if (Subtarget->hasFullFP16())
756 }
757
758 // Strict conversion to a larger type is legal
759 for (auto VT : {MVT::f32, MVT::f64})
761
763
766
772
773 // Generate outline atomics library calls only if LSE was not specified for
774 // the subtarget.
775 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
801#define LCALLNAMES(A, B, N) \
802 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
803 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
804 setLibcallName(A##N##_REL, #B #N "_rel"); \
805 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
806#define LCALLNAME4(A, B) \
807 LCALLNAMES(A, B, 1) \
808 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
809#define LCALLNAME5(A, B) \
810 LCALLNAMES(A, B, 1) \
811 LCALLNAMES(A, B, 2) \
812 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
813 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
814 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
815 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
816 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
817 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
818 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
819#undef LCALLNAMES
820#undef LCALLNAME4
821#undef LCALLNAME5
822 }
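
// Illustrative expansion (not part of the original file): for example,
// LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) registers the 1-, 2-,
// 4- and 8-byte variants in each of the four memory orderings, one of which is
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_ACQ_REL, "__aarch64_swp4_acq_rel");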
823
824 if (Subtarget->hasLSE128()) {
825 // Custom lowering because i128 is not legal. Must be replaced by 2x64
826 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
830 }
831
832 // 128-bit loads and stores can be done without expanding
835
836 // Aligned 128-bit loads and stores are single-copy atomic according to the
837 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
838 if (Subtarget->hasLSE2()) {
841 }
842
843 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
844 // custom lowering, as there are no un-paired non-temporal stores and
845 // legalization will break up 256 bit inputs.
853
854 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
855 // custom lowering, as there are no un-paired non-temporal loads and
856 // legalization will break up 256 bit inputs.
864
865 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
867
868 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
869 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
870 // Issue __sincos_stret if available.
873 } else {
876 }
877
878 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
879 // MSVCRT doesn't have powi; fall back to pow
880 setLibcallName(RTLIB::POWI_F32, nullptr);
881 setLibcallName(RTLIB::POWI_F64, nullptr);
882 }
883
884 // Make floating-point constants legal for the large code model, so they don't
885 // become loads from the constant pool.
886 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
889 }
890
891 // AArch64 does not have floating-point extending loads, i1 sign-extending
892 // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
893 for (MVT VT : MVT::fp_valuetypes()) {
898 }
899 for (MVT VT : MVT::integer_valuetypes())
901
909
913
914 // Indexed loads and stores are supported.
915 for (unsigned im = (unsigned)ISD::PRE_INC;
933 }
934
935 // Trap.
939
940 // We combine OR nodes for bitfield operations.
942 // Try to create BICs for vector ANDs.
944
945 // Vector add and sub nodes may conceal a high-half opportunity.
946 // Also, try to fold ADD into CSINC/CSINV.
949
952
953 // Try to combine setcc with csel
955
957
964
966
968
970
974
976
978
980
982
983 // In case of strict alignment, avoid an excessive number of byte wide stores.
986 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
987
991 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
992
995
998 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
999
1001
1003
1004 EnableExtLdPromotion = true;
1005
1006 // Set required alignment.
1008 // Set preferred alignments.
1012
1013 // Only change the limit for entries in a jump table if specified by
1014 // the subtarget, but not at the command line.
1015 unsigned MaxJT = STI.getMaximumJumpTableSize();
1016 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1018
1020
1022
1024
1025 if (Subtarget->hasNEON()) {
1026 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1027 // silliness like this:
1028 for (auto Op :
1044
1045 for (auto Op :
1051
1052 // AArch64 doesn't have direct vector->f32 conversion instructions for
1053 // elements smaller than i32, so promote the input to i32 first.
1056
1057 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1058 // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
1059 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1062 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1063 setOperationAction(Op, VT, Custom);
1064
1065 if (Subtarget->hasFullFP16()) {
1067
1076 } else {
1077 // When AArch64 doesn't have fullfp16 support, promote the input
1078 // to i32 first.
1087 }
1088
1097 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1102 }
1103
1104 // AArch64 doesn't have MUL.2d:
1106 // Custom handling for some quad-vector types to detect MULL.
1110
1111 // Saturates
1112 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1118 }
1119
1121 MVT::v4i32}) {
1128 }
1129
1130 // Vector reductions
1131 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1133 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1136
1138 }
1139 }
1140 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1147 }
1149
1152 // Likewise, narrowing and extending vector loads/stores aren't handled
1153 // directly.
1156
1157 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1160 } else {
1163 }
1166
1169
1170 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1171 setTruncStoreAction(VT, InnerVT, Expand);
1172 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1173 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1174 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1175 }
1176 }
1177
1178 // AArch64 has implementations of a lot of rounding-like FP operations.
1179 for (auto Op :
1184 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1185 setOperationAction(Op, Ty, Legal);
1186 if (Subtarget->hasFullFP16())
1187 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1188 setOperationAction(Op, Ty, Legal);
1189 }
1190
1192
1199
1200 // ADDP custom lowering
1203 // FADDP custom lowering
1204 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1206 }
1207
1208 if (Subtarget->hasSME()) {
1210 }
1211
1212 // FIXME: Move lowering for more nodes here if those are common between
1213 // SVE and SME.
1214 if (Subtarget->hasSVEorSME()) {
1215 for (auto VT :
1219 }
1220 }
1221
1222 if (Subtarget->hasSVE()) {
1223 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1264
1270
1279
1280 if (Subtarget->hasSVE2()) {
1285 }
1286 }
1287
1288 // Illegal unpacked integer vector types.
1289 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1292 }
1293
1294 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1298
1299 for (auto VT :
1303
1304 for (auto VT :
1313
1317
1318 // There are no legal MVT::nxv16f## based types.
1319 if (VT != MVT::nxv16i1) {
1322 }
1323 }
1324
1325 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1333 }
1334
1335 // Firstly, exclude all scalable vector extending loads/truncating stores,
1336 // including both integer and floating-point scalable vectors.
1338 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1339 setTruncStoreAction(VT, InnerVT, Expand);
1340 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1341 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1342 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1343 }
1344 }
1345
1346 // Then, selectively enable those which we directly support.
1353 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1360 }
1361
1362 // SVE supports truncating stores of 64 and 128-bit vectors
1368
1405
1418
1430 }
1431
1432 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1439 }
1440
1443
1444 // NEON doesn't support integer divides, but SVE does
1449 }
1450
1451 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1454
1455 // NEON doesn't support across-vector reductions, but SVE does.
1458
1459 if (Subtarget->forceStreamingCompatibleSVE()) {
1471 addTypeForStreamingSVE(VT);
1472
1473 for (MVT VT :
1475 addTypeForStreamingSVE(VT);
1476 }
1477
1478 // NOTE: Currently this has to happen after computeRegisterProperties rather
1479 // than the preferred option of combining it with the addRegisterClass call.
1480 if (Subtarget->useSVEForFixedLengthVectors()) {
1483 addTypeForFixedLengthSVE(VT);
1486 addTypeForFixedLengthSVE(VT);
1487
1488 // 64bit results can mean a bigger than NEON input.
1489 for (auto VT : {MVT::v8i8, MVT::v4i16})
1492
1493 // 128bit results imply a bigger than NEON input.
1494 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1496 for (auto VT : {MVT::v8f16, MVT::v4f32})
1498
1499 // These operations are not supported on NEON but SVE can do them.
1520
1521 // Int operations with no NEON support.
1522 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1529 }
1530
1531
1532 // Use SVE for vectors with more than 2 elements.
1533 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1535 }
1536
1541
1543 }
1544
1545 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1546 // Only required for llvm.aarch64.mops.memset.tag
1548 }
1549
1551
1552 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1553
1554 IsStrictFPEnabled = true;
1555}
1556
1557void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1558 assert(VT.isVector() && "VT should be a vector type");
1559
1560 if (VT.isFloatingPoint()) {
1562 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1563 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1564 }
1565
1566 // Mark vector float intrinsics as expand.
1567 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1576 }
1577
1578 // But we do support custom-lowering for FCOPYSIGN.
1579 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1580 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1582
1595
1599 for (MVT InnerVT : MVT::all_valuetypes())
1600 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1601
1602 // CNT supports only B element sizes, then use UADDLP to widen.
1603 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1605
1611
1612 for (unsigned Opcode :
1615 setOperationAction(Opcode, VT, Custom);
1616
1617 if (!VT.isFloatingPoint())
1619
1620 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1621 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1622 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1623 setOperationAction(Opcode, VT, Legal);
1624
1625 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1626 // NEON types.
1627 if (VT.isFloatingPoint() &&
1629 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1630 for (unsigned Opcode :
1636 setOperationAction(Opcode, VT, Legal);
1637
1638 // Strict fp extend and trunc are legal
1639 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1641 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1643
1644 // FIXME: We could potentially make use of the vector comparison instructions
1645 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1646 // complications:
1647 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1648 // so we would need to expand when the condition code doesn't match the
1649 // kind of comparison.
1650 // * Some kinds of comparison require more than one FCMXY instruction so
1651 // would need to be expanded instead.
1652 // * The lowering of the non-strict versions involves target-specific ISD
1653 // nodes so we would likely need to add strict versions of all of them and
1654 // handle them appropriately.
1657
1658 if (Subtarget->isLittleEndian()) {
1659 for (unsigned im = (unsigned)ISD::PRE_INC;
1663 }
1664 }
1665
1666 if (Subtarget->hasD128()) {
1669 }
1670}
1671
1673 EVT OpVT) const {
1674 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1675 if (!Subtarget->hasSVE())
1676 return true;
1677
1678 // We can only support legal predicate result types. We can use the SVE
1679 // whilelo instruction for generating fixed-width predicates too.
1680 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1681 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1682 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1683 return true;
1684
1685 // The whilelo instruction only works with i32 or i64 scalar inputs.
1686 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1687 return true;
1688
1689 return false;
1690}
1691
1692void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) {
1693 // By default set all operations to Expand,
1694 // then change to Legal/Custom if needed.
1695 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1696 setOperationAction(Op, VT, Expand);
1697
1698 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1699
1700 if (VT.isFloatingPoint()) {
1710 }
1711
1712 // STORE, LOAD, SCALAR_TO_VECTOR and BITCAST are natively supported,
1713 // so no need to Custom/Expand them.
1718
1719 // Mark integer truncating stores/extending loads as having custom lowering
1720 if (VT.isInteger()) {
1721 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1722 while (InnerVT != VT) {
1723 setTruncStoreAction(VT, InnerVT, Custom);
1724 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1725 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1726 InnerVT = InnerVT.changeVectorElementType(
1727 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1728 }
1729 }
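
// Illustrative walk-through (not part of the original file): for a
// hypothetical VT of v4i32, InnerVT visits v4i8 and then v4i16, so the
// truncating stores v4i32->v4i8 and v4i32->v4i16 plus the matching
// sign/zero-extending loads all get Custom lowering; the loop stops once
// InnerVT has widened to v4i32 itself.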
1730
1731 // Mark floating-point truncating stores/extending loads as having custom
1732 // lowering
1733 if (VT.isFloatingPoint()) {
1734 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1735 while (InnerVT != VT) {
1736 setTruncStoreAction(VT, InnerVT, Custom);
1737 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1738 InnerVT = InnerVT.changeVectorElementType(
1740 }
1741 }
1742
1811}
1812
1813void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1814 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1815
1816 // By default everything must be expanded.
1817 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1818 setOperationAction(Op, VT, Expand);
1819
1820 if (VT.isFloatingPoint()) {
1830 }
1831
1832 // Mark integer truncating stores/extending loads as having custom lowering
1833 if (VT.isInteger()) {
1834 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1835 while (InnerVT != VT) {
1836 setTruncStoreAction(VT, InnerVT, Custom);
1837 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1838 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1839 InnerVT = InnerVT.changeVectorElementType(
1840 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1841 }
1842 }
1843
1844 // Mark floating-point truncating stores/extending loads as having custom
1845 // lowering
1846 if (VT.isFloatingPoint()) {
1847 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1848 while (InnerVT != VT) {
1849 setTruncStoreAction(VT, InnerVT, Custom);
1850 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1851 InnerVT = InnerVT.changeVectorElementType(
1853 }
1854 }
1855
1856 // Lower fixed length vector operations to scalable equivalents.
1941}
1942
1943void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1944 addRegisterClass(VT, &AArch64::FPR64RegClass);
1945 addTypeForNEON(VT);
1946}
1947
1948void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1949 addRegisterClass(VT, &AArch64::FPR128RegClass);
1950 addTypeForNEON(VT);
1951}
1952
1954 LLVMContext &C, EVT VT) const {
1955 if (!VT.isVector())
1956 return MVT::i32;
1957 if (VT.isScalableVector())
1960}
1961
1962// isIntImmediate - This method tests to see if the node is a constant
1963// operand. If so, Imm will receive the value.
1964static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
1965 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
1966 Imm = C->getZExtValue();
1967 return true;
1968 }
1969 return false;
1970}
1971
1972// isOpcWithIntImmediate - This method tests to see if the node is a specific
1973// opcode and that it has an immediate integer right operand.
1974// If so, Imm will receive the value.
1975static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
1976 uint64_t &Imm) {
1977 return N->getOpcode() == Opc &&
1978 isIntImmediate(N->getOperand(1).getNode(), Imm);
1979}
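
// Illustrative use (not part of the original file): for a node N representing
// (and x, 42), isOpcWithIntImmediate(N, ISD::AND, Imm) returns true and sets
// Imm to 42.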
1980
1981static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1982 const APInt &Demanded,
1984 unsigned NewOpc) {
1985 uint64_t OldImm = Imm, NewImm, Enc;
1986 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1987
1988 // Return if the immediate is already all zeros, all ones, a bimm32 or a
1989 // bimm64.
1990 if (Imm == 0 || Imm == Mask ||
1992 return false;
1993
1994 unsigned EltSize = Size;
1995 uint64_t DemandedBits = Demanded.getZExtValue();
1996
1997 // Clear bits that are not demanded.
1998 Imm &= DemandedBits;
1999
2000 while (true) {
2001 // The goal here is to set the non-demanded bits in a way that minimizes
2002 // the number of transitions between 0 and 1. In order to achieve this goal,
2003 // we set the non-demanded bits to the value of the preceding demanded bits.
2004 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2005 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2006 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2007 // The final result is 0b11000011.
2008 uint64_t NonDemandedBits = ~DemandedBits;
2009 uint64_t InvertedImm = ~Imm & DemandedBits;
2010 uint64_t RotatedImm =
2011 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2012 NonDemandedBits;
2013 uint64_t Sum = RotatedImm + NonDemandedBits;
2014 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2015 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2016 NewImm = (Imm | Ones) & Mask;
2017
2018 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2019 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2020 // we halve the element size and continue the search.
2021 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2022 break;
2023
2024 // We cannot shrink the element size any further if it is already 2 bits.
2025 if (EltSize == 2)
2026 return false;
2027
2028 EltSize /= 2;
2029 Mask >>= EltSize;
2030 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2031
2032 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2033 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2034 return false;
2035
2036 // Merge the upper and lower halves of Imm and DemandedBits.
2037 Imm |= Hi;
2038 DemandedBits |= DemandedBitsHi;
2039 }
2040
2041 ++NumOptimizedImms;
2042
2043 // Replicate the element across the register width.
2044 while (EltSize < Size) {
2045 NewImm |= NewImm << EltSize;
2046 EltSize *= 2;
2047 }
2048
2049 (void)OldImm;
2050 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2051 "demanded bits should never be altered");
2052 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2053
2054 // Create the new constant immediate node.
2055 EVT VT = Op.getValueType();
2056 SDLoc DL(Op);
2057 SDValue New;
2058
2059 // If the new constant immediate is all-zeros or all-ones, let the target
2060 // independent DAG combine optimize this node.
2061 if (NewImm == 0 || NewImm == OrigMask) {
2062 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2063 TLO.DAG.getConstant(NewImm, DL, VT));
2064 // Otherwise, create a machine node so that target independent DAG combine
2065 // doesn't undo this optimization.
2066 } else {
2068 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2069 New = SDValue(
2070 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2071 }
2072
2073 return TLO.CombineTo(Op, New);
2074}
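
// A minimal, self-contained sketch (not part of the original file; the helper
// name is hypothetical): one iteration of the non-demanded-bit filling trick
// from optimizeLogicalImm above, shrunk to an 8-bit toy so the worked example
// from the comment (0bx10xx0x1 becoming 0b11000011) can be checked directly.
#include <cstdint>

constexpr uint64_t fillNonDemandedBits8(uint64_t Imm, uint64_t Demanded) {
  const unsigned EltSize = 8;
  const uint64_t Mask = 0xFF;
  // optimizeLogicalImm starts with EltSize equal to the register width, so it
  // never needs to mask ~DemandedBits; the 8-bit toy masks it explicitly.
  uint64_t NonDemanded = ~Demanded & Mask;
  uint64_t Inverted = ~Imm & Demanded;
  uint64_t Rotated =
      ((Inverted << 1) | ((Inverted >> (EltSize - 1)) & 1)) & NonDemanded;
  // Adding the rotated mask lets a carry sweep each run of non-demanded bits
  // that sits above a demanded 0; the surviving bits (Ones) are exactly the
  // runs that sit above a demanded 1, which is what gets OR'd into Imm.
  uint64_t Sum = Rotated + NonDemanded;
  uint64_t Carry = NonDemanded & ~Sum & (1ULL << (EltSize - 1));
  uint64_t Ones = (Sum + Carry) & NonDemanded;
  return (Imm | Ones) & Mask;
}

// Demanded bits 0b01100101 with demanded values 0b01000001 encode the pattern
// 0bx10xx0x1; each 'x' is filled from the demanded bit below it.
static_assert(fillNonDemandedBits8(0b01000001, 0b01100101) == 0b11000011,
              "each gap takes the value of the preceding demanded bit");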
2075
2077 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2078 TargetLoweringOpt &TLO) const {
2079 // Delay this optimization to as late as possible.
2080 if (!TLO.LegalOps)
2081 return false;
2082
2084 return false;
2085
2086 EVT VT = Op.getValueType();
2087 if (VT.isVector())
2088 return false;
2089
2090 unsigned Size = VT.getSizeInBits();
2091 assert((Size == 32 || Size == 64) &&
2092 "i32 or i64 is expected after legalization.");
2093
2094 // Exit early if we demand all bits.
2095 if (DemandedBits.countPopulation() == Size)
2096 return false;
2097
2098 unsigned NewOpc;
2099 switch (Op.getOpcode()) {
2100 default:
2101 return false;
2102 case ISD::AND:
2103 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2104 break;
2105 case ISD::OR:
2106 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2107 break;
2108 case ISD::XOR:
2109 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2110 break;
2111 }
2112 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2113 if (!C)
2114 return false;
2115 uint64_t Imm = C->getZExtValue();
2116 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2117}
2118
2119/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2120/// Mask are known to be either zero or one and return them in Known.
2122 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2123 const SelectionDAG &DAG, unsigned Depth) const {
2124 switch (Op.getOpcode()) {
2125 default:
2126 break;
2127 case AArch64ISD::DUP: {
2128 SDValue SrcOp = Op.getOperand(0);
2129 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2130 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2131 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2132 "Expected DUP implicit truncation");
2133 Known = Known.trunc(Op.getScalarValueSizeInBits());
2134 }
2135 break;
2136 }
2137 case AArch64ISD::CSEL: {
2138 KnownBits Known2;
2139 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2140 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2141 Known = KnownBits::commonBits(Known, Known2);
2142 break;
2143 }
2144 case AArch64ISD::BICi: {
2145 // Compute the bit cleared value.
2146 uint64_t Mask =
2147 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2148 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2149 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2150 break;
2151 }
2152 case AArch64ISD::VLSHR: {
2153 KnownBits Known2;
2154 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2155 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2156 Known = KnownBits::lshr(Known, Known2);
2157 break;
2158 }
2159 case AArch64ISD::VASHR: {
2160 KnownBits Known2;
2161 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2162 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2163 Known = KnownBits::ashr(Known, Known2);
2164 break;
2165 }
2166 case AArch64ISD::MOVI: {
2167 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(0));
2168 Known =
2170 break;
2171 }
2173 case AArch64ISD::ADDlow: {
2174 if (!Subtarget->isTargetILP32())
2175 break;
2176 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2177 Known.Zero = APInt::getHighBitsSet(64, 32);
2178 break;
2179 }
2181 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2182 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2183 break;
2184 }
2186 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
2187 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
2188 switch (IntID) {
2189 default: return;
2190 case Intrinsic::aarch64_ldaxr:
2191 case Intrinsic::aarch64_ldxr: {
2192 unsigned BitWidth = Known.getBitWidth();
2193 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2194 unsigned MemBits = VT.getScalarSizeInBits();
2195 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2196 return;
2197 }
2198 }
2199 break;
2200 }
2202 case ISD::INTRINSIC_VOID: {
2203 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2204 switch (IntNo) {
2205 default:
2206 break;
2207 case Intrinsic::aarch64_neon_umaxv:
2208 case Intrinsic::aarch64_neon_uminv: {
2209 // Figure out the datatype of the vector operand. The UMINV instruction
2210 // will zero extend the result, so we can mark as known zero all the
2211 // bits larger than the element datatype. 32-bit or larger doesn't need
2212 // this as those are legal types and will be handled by isel directly.
2213 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2214 unsigned BitWidth = Known.getBitWidth();
2215 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2216 assert(BitWidth >= 8 && "Unexpected width!");
2218 Known.Zero |= Mask;
2219 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2220 assert(BitWidth >= 16 && "Unexpected width!");
2222 Known.Zero |= Mask;
2223 }
2224 break;
2225 } break;
2226 }
2227 }
2228 }
2229}
2230
2232 EVT) const {
2233 return MVT::i64;
2234}
2235
2237 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2238 unsigned *Fast) const {
2239 if (Subtarget->requiresStrictAlign())
2240 return false;
2241
2242 if (Fast) {
2243 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2244 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2245 // See comments in performSTORECombine() for more details about
2246 // these conditions.
2247
2248 // Code that uses clang vector extensions can mark that it
2249 // wants unaligned accesses to be treated as fast by
2250 // underspecifying alignment to be 1 or 2.
2251 Alignment <= 2 ||
2252
2253 // Disregard v2i64. Memcpy lowering produces those and splitting
2254 // them regresses performance on micro-benchmarks and olden/bh.
2255 VT == MVT::v2i64;
2256 }
2257 return true;
2258}
2259
2260// Same as above but handling LLTs instead.
2262 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2263 unsigned *Fast) const {
2264 if (Subtarget->requiresStrictAlign())
2265 return false;
2266
2267 if (Fast) {
2268 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2269 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2270 Ty.getSizeInBytes() != 16 ||
2271 // See comments in performSTORECombine() for more details about
2272 // these conditions.
2273
2274 // Code that uses clang vector extensions can mark that it
2275 // wants unaligned accesses to be treated as fast by
2276 // underspecifying alignment to be 1 or 2.
2277 Alignment <= 2 ||
2278
2279 // Disregard v2i64. Memcpy lowering produces those and splitting
2280 // them regresses performance on micro-benchmarks and olden/bh.
2281 Ty == LLT::fixed_vector(2, 64);
2282 }
2283 return true;
2284}
2285
2286FastISel *
2288 const TargetLibraryInfo *libInfo) const {
2289 return AArch64::createFastISel(funcInfo, libInfo);
2290}
2291
2292const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2293#define MAKE_CASE(V) \
2294 case V: \
2295 return #V;
2296 switch ((AArch64ISD::NodeType)Opcode) {
2298 break;
2601 }
2602#undef MAKE_CASE
2603 return nullptr;
2604}
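
// A minimal, self-contained sketch (not part of the original file; the toy
// enum and function names are hypothetical) of the MAKE_CASE X-macro pattern
// used above: each MAKE_CASE(V) stamps out "case V: return #V;", so the name
// table can never drift out of sync with the opcode enum.
namespace toy {
enum NodeType { CSEL, DUP, BICi };

inline const char *getNodeName(NodeType Opcode) {
#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;
  switch (Opcode) {
    MAKE_CASE(CSEL)
    MAKE_CASE(DUP)
    MAKE_CASE(BICi)
  }
#undef MAKE_CASE
  return nullptr; // unreachable for in-range opcodes
}
} // namespace toy

// Example: toy::getNodeName(toy::DUP) yields "DUP".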
2605
2608 MachineBasicBlock *MBB) const {
2609 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2610 // phi node:
2611
2612 // OrigBB:
2613 // [... previous instrs leading to comparison ...]
2614 // b.ne TrueBB
2615 // b EndBB
2616 // TrueBB:
2617 // ; Fallthrough
2618 // EndBB:
2619 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2620
2621 MachineFunction *MF = MBB->getParent();
2622 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2623 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2624 DebugLoc DL = MI.getDebugLoc();
2626
2627 Register DestReg = MI.getOperand(0).getReg();
2628 Register IfTrueReg = MI.getOperand(1).getReg();
2629 Register IfFalseReg = MI.getOperand(2).getReg();
2630 unsigned CondCode = MI.getOperand(3).getImm();
2631 bool NZCVKilled = MI.getOperand(4).isKill();
2632
2633 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2634 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2635 MF->insert(It, TrueBB);
2636 MF->insert(It, EndBB);
2637
2638 // Transfer rest of current basic-block to EndBB
2639 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2640 MBB->end());
2642
2643 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2644 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2645 MBB->addSuccessor(TrueBB);
2646 MBB->addSuccessor(EndBB);
2647
2648 // TrueBB falls through to the end.
2649 TrueBB->addSuccessor(EndBB);
2650
2651 if (!NZCVKilled) {
2652 TrueBB->addLiveIn(AArch64::NZCV);
2653 EndBB->addLiveIn(AArch64::NZCV);
2654 }
2655
2656 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2657 .addReg(IfTrueReg)
2658 .addMBB(TrueBB)
2659 .addReg(IfFalseReg)
2660 .addMBB(MBB);
2661
2662 MI.eraseFromParent();
2663 return EndBB;
2664}
2665
2667 MachineInstr &MI, MachineBasicBlock *BB) const {
2669 BB->getParent()->getFunction().getPersonalityFn())) &&
2670 "SEH does not use catchret!");
2671 return BB;
2672}
2673
2675AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2677 MachineBasicBlock *BB) const {
2678 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2679 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2680
2681 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2682 MIB.add(MI.getOperand(1)); // slice index register
2683 MIB.add(MI.getOperand(2)); // slice index offset
2684 MIB.add(MI.getOperand(3)); // pg
2685 MIB.add(MI.getOperand(4)); // base
2686 MIB.add(MI.getOperand(5)); // offset
2687
2688 MI.eraseFromParent(); // The pseudo is gone now.
2689 return BB;
2690}
2691
2694 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2696 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2697
2698 MIB.addReg(AArch64::ZA, RegState::Define);
2699 MIB.add(MI.getOperand(0)); // Vector select register
2700 MIB.add(MI.getOperand(1)); // Vector select offset
2701 MIB.add(MI.getOperand(2)); // Base
2702 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2703
2704 MI.eraseFromParent(); // The pseudo is gone now.
2705 return BB;
2706}
2707
2709AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2711 MachineBasicBlock *BB, bool HasTile) const {
2712 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2713 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2714 unsigned StartIdx = 0;
2715
2716 if (HasTile) {
2717 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2718 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2719 StartIdx = 1;
2720 } else
2721 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2722
2723 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2724 MIB.add(MI.getOperand(I));
2725
2726 MI.eraseFromParent(); // The pseudo is gone now.
2727 return BB;
2728}
2729
2732 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2734 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2735 MIB.add(MI.getOperand(0)); // Mask
2736
2737 unsigned Mask = MI.getOperand(0).getImm();
2738 for (unsigned I = 0; I < 8; I++) {
2739 if (Mask & (1 << I))
2740 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2741 }
2742
2743 MI.eraseFromParent(); // The pseudo is gone now.
2744 return BB;
2745}
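
// Illustrative example (not part of the original file): a mask operand of
// 0b00000101 adds ZAD0 and ZAD2 as implicit defs, i.e. the ZERO instruction
// emitted for this pseudo zeroes those two 64-bit ZA tiles.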
2746
2748 MachineInstr &MI, MachineBasicBlock *BB) const {
2749
2750 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2751 if (SMEOrigInstr != -1) {
2752 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2753 uint64_t SMEMatrixType =
2754 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2755 switch (SMEMatrixType) {
2757 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2759 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2761 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2763 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2765 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2767 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2768 }
2769 }
2770
2771 switch (MI.getOpcode()) {
2772 default:
2773#ifndef NDEBUG
2774 MI.dump();
2775#endif
2776 llvm_unreachable("Unexpected instruction for custom inserter!");
2777
2778 case AArch64::F128CSEL:
2779 return EmitF128CSEL(MI, BB);
2780 case TargetOpcode::STATEPOINT:
2781 // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
2782 // while the bl call instruction (to which the statepoint is lowered in the
2783 // end) has an implicit def. This def is early-clobber as it is set at
2784 // the moment of the call, earlier than any use is read.
2785 // Add this implicit dead def here as a workaround.
2786 MI.addOperand(*MI.getMF(),
2788 AArch64::LR, /*isDef*/ true,
2789 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2790 /*isUndef*/ false, /*isEarlyClobber*/ true));
2791 [[fallthrough]];
2792 case TargetOpcode::STACKMAP:
2793 case TargetOpcode::PATCHPOINT:
2794 return emitPatchPoint(MI, BB);
2795
2796 case AArch64::CATCHRET:
2797 return EmitLoweredCatchRet(MI, BB);
2798 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2799 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2800 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2801 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2802 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2803 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2804 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2805 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2806 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2807 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2808 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2809 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2810 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2811 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2812 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2813 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2814 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2815 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2816 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2817 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2818 case AArch64::LDR_ZA_PSEUDO:
2819 return EmitFill(MI, BB);
2820 case AArch64::ZERO_M_PSEUDO:
2821 return EmitZero(MI, BB);
2822 }
2823}
2824
2825//===----------------------------------------------------------------------===//
2826// AArch64 Lowering private implementation.
2827//===----------------------------------------------------------------------===//
2828
2829//===----------------------------------------------------------------------===//
2830// Lowering Code
2831//===----------------------------------------------------------------------===//
2832
2833// Forward declarations of SVE fixed length lowering helpers
2838 SelectionDAG &DAG);
2840 EVT VT);
2841
2842/// isZerosVector - Check whether SDNode N is a zero-filled vector.
2843static bool isZerosVector(const SDNode *N) {
2844 // Look through a bit convert.
2845 while (N->getOpcode() == ISD::BITCAST)
2846 N = N->getOperand(0).getNode();
2847
2849 return true;
2850
2851 if (N->getOpcode() != AArch64ISD::DUP)
2852 return false;
2853
2854 auto Opnd0 = N->getOperand(0);
2855 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
2856}
2857
2858/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2859/// CC
2861 switch (CC) {
2862 default:
2863 llvm_unreachable("Unknown condition code!");
2864 case ISD::SETNE:
2865 return AArch64CC::NE;
2866 case ISD::SETEQ:
2867 return AArch64CC::EQ;
2868 case ISD::SETGT:
2869 return AArch64CC::GT;
2870 case ISD::SETGE:
2871 return AArch64CC::GE;
2872 case ISD::SETLT:
2873 return AArch64CC::LT;
2874 case ISD::SETLE:
2875 return AArch64CC::LE;
2876 case ISD::SETUGT:
2877 return AArch64CC::HI;
2878 case ISD::SETUGE:
2879 return AArch64CC::HS;
2880 case ISD::SETULT:
2881 return AArch64CC::LO;
2882 case ISD::SETULE:
2883 return AArch64CC::LS;
2884 }
2885}
2886
2887/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2889 AArch64CC::CondCode &CondCode,
2890 AArch64CC::CondCode &CondCode2) {
2891 CondCode2 = AArch64CC::AL;
2892 switch (CC) {
2893 default:
2894 llvm_unreachable("Unknown FP condition!");
2895 case ISD::SETEQ:
2896 case ISD::SETOEQ:
2897 CondCode = AArch64CC::EQ;
2898 break;
2899 case ISD::SETGT:
2900 case ISD::SETOGT:
2901 CondCode = AArch64CC::GT;
2902 break;
2903 case ISD::SETGE:
2904 case ISD::SETOGE:
2905 CondCode = AArch64CC::GE;
2906 break;
2907 case ISD::SETOLT:
2908 CondCode = AArch64CC::MI;
2909 break;
2910 case ISD::SETOLE:
2911 CondCode = AArch64CC::LS;
2912 break;
2913 case ISD::SETONE:
2914 CondCode = AArch64CC::MI;
2915 CondCode2 = AArch64CC::GT;
2916 break;
2917 case ISD::SETO:
2918 CondCode = AArch64CC::VC;
2919 break;
2920 case ISD::SETUO:
2921 CondCode = AArch64CC::VS;
2922 break;
2923 case ISD::SETUEQ:
2924 CondCode = AArch64CC::EQ;
2925 CondCode2 = AArch64CC::VS;
2926 break;
2927 case ISD::SETUGT:
2928 CondCode = AArch64CC::HI;
2929 break;
2930 case ISD::SETUGE:
2931 CondCode = AArch64CC::PL;
2932 break;
2933 case ISD::SETLT:
2934 case ISD::SETULT: