1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
23#include "llvm/ADT/APFloat.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/Statistic.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
82#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP operations use ALU ports, and the data dependency
143// becomes the bottleneck after this transform on high-end CPUs, so this maximum
144// leaf-node limit guards that the cmp+ccmp conversion remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even though SVE support for some
150// instructions is not yet complete.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157/// Value type used for condition codes.
158static const MVT MVT_CC = MVT::i32;
159
160static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
161 AArch64::X3, AArch64::X4, AArch64::X5,
162 AArch64::X6, AArch64::X7};
163static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
164 AArch64::Q3, AArch64::Q4, AArch64::Q5,
165 AArch64::Q6, AArch64::Q7};
166
168
170
171static inline EVT getPackedSVEVectorVT(EVT VT) {
172 switch (VT.getSimpleVT().SimpleTy) {
173 default:
174 llvm_unreachable("unexpected element type for vector");
175 case MVT::i8:
176 return MVT::nxv16i8;
177 case MVT::i16:
178 return MVT::nxv8i16;
179 case MVT::i32:
180 return MVT::nxv4i32;
181 case MVT::i64:
182 return MVT::nxv2i64;
183 case MVT::f16:
184 return MVT::nxv8f16;
185 case MVT::f32:
186 return MVT::nxv4f32;
187 case MVT::f64:
188 return MVT::nxv2f64;
189 case MVT::bf16:
190 return MVT::nxv8bf16;
191 }
192}
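// Illustrative sketch (not part of the upstream file): the mapping performed by
// getPackedSVEVectorVT above, shown for two element types. `ContainerVT` and
// `IntVT` are hypothetical locals used only for this example.
//   EVT ContainerVT = getPackedSVEVectorVT(MVT::f16); // MVT::nxv8f16
//   EVT IntVT       = getPackedSVEVectorVT(MVT::i32); // MVT::nxv4i32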
193
194// NOTE: Currently there's only a need to return integer vector types. If this
195// changes then just add an extra "type" parameter.
197 switch (EC.getKnownMinValue()) {
198 default:
199 llvm_unreachable("unexpected element count for vector");
200 case 16:
201 return MVT::nxv16i8;
202 case 8:
203 return MVT::nxv8i16;
204 case 4:
205 return MVT::nxv4i32;
206 case 2:
207 return MVT::nxv2i64;
208 }
209}
210
212 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
213 "Expected scalable predicate vector type!");
214 switch (VT.getVectorMinNumElements()) {
215 default:
216 llvm_unreachable("unexpected element count for vector");
217 case 2:
218 return MVT::nxv2i64;
219 case 4:
220 return MVT::nxv4i32;
221 case 8:
222 return MVT::nxv8i16;
223 case 16:
224 return MVT::nxv16i8;
225 }
226}
227
228/// Returns true if VT's elements occupy the lowest bit positions of its
229/// associated register class without any intervening space.
230///
231/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
232/// same register class, but only nxv8f16 can be treated as a packed vector.
233static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
235 "Expected legal vector type!");
236 return VT.isFixedLengthVector() ||
238}
239
240// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
241// predicate and end with a passthru value matching the result type.
242static bool isMergePassthruOpcode(unsigned Opc) {
243 switch (Opc) {
244 default:
245 return false;
276 return true;
277 }
278}
279
280// Returns true if inactive lanes are known to be zeroed by construction.
282 switch (Op.getOpcode()) {
283 default:
284 return false;
285 // We guarantee i1 splat_vectors to zero the other lanes
289 return true;
291 switch (Op.getConstantOperandVal(0)) {
292 default:
293 return false;
294 case Intrinsic::aarch64_sve_ptrue:
295 case Intrinsic::aarch64_sve_pnext:
296 case Intrinsic::aarch64_sve_cmpeq:
297 case Intrinsic::aarch64_sve_cmpne:
298 case Intrinsic::aarch64_sve_cmpge:
299 case Intrinsic::aarch64_sve_cmpgt:
300 case Intrinsic::aarch64_sve_cmphs:
301 case Intrinsic::aarch64_sve_cmphi:
302 case Intrinsic::aarch64_sve_cmpeq_wide:
303 case Intrinsic::aarch64_sve_cmpne_wide:
304 case Intrinsic::aarch64_sve_cmpge_wide:
305 case Intrinsic::aarch64_sve_cmpgt_wide:
306 case Intrinsic::aarch64_sve_cmplt_wide:
307 case Intrinsic::aarch64_sve_cmple_wide:
308 case Intrinsic::aarch64_sve_cmphs_wide:
309 case Intrinsic::aarch64_sve_cmphi_wide:
310 case Intrinsic::aarch64_sve_cmplo_wide:
311 case Intrinsic::aarch64_sve_cmpls_wide:
312 case Intrinsic::aarch64_sve_fcmpeq:
313 case Intrinsic::aarch64_sve_fcmpne:
314 case Intrinsic::aarch64_sve_fcmpge:
315 case Intrinsic::aarch64_sve_fcmpgt:
316 case Intrinsic::aarch64_sve_fcmpuo:
317 case Intrinsic::aarch64_sve_facgt:
318 case Intrinsic::aarch64_sve_facge:
319 case Intrinsic::aarch64_sve_whilege:
320 case Intrinsic::aarch64_sve_whilegt:
321 case Intrinsic::aarch64_sve_whilehi:
322 case Intrinsic::aarch64_sve_whilehs:
323 case Intrinsic::aarch64_sve_whilele:
324 case Intrinsic::aarch64_sve_whilelo:
325 case Intrinsic::aarch64_sve_whilels:
326 case Intrinsic::aarch64_sve_whilelt:
327 case Intrinsic::aarch64_sve_match:
328 case Intrinsic::aarch64_sve_nmatch:
329 case Intrinsic::aarch64_sve_whilege_x2:
330 case Intrinsic::aarch64_sve_whilegt_x2:
331 case Intrinsic::aarch64_sve_whilehi_x2:
332 case Intrinsic::aarch64_sve_whilehs_x2:
333 case Intrinsic::aarch64_sve_whilele_x2:
334 case Intrinsic::aarch64_sve_whilelo_x2:
335 case Intrinsic::aarch64_sve_whilels_x2:
336 case Intrinsic::aarch64_sve_whilelt_x2:
337 return true;
338 }
339 }
340}
341
342static std::tuple<SDValue, SDValue>
344 SDLoc DL(Disc);
345 SDValue AddrDisc;
346 SDValue ConstDisc;
347
348 // If this is a blend, remember the constant and address discriminators.
349 // Otherwise, it's either a constant discriminator, or a non-blended
350 // address discriminator.
351 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
352 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
353 AddrDisc = Disc->getOperand(1);
354 ConstDisc = Disc->getOperand(2);
355 } else {
356 ConstDisc = Disc;
357 }
358
359 // If the constant discriminator (either the blend RHS, or the entire
360 // discriminator value) isn't a 16-bit constant, bail out, and let the
361 // discriminator be computed separately.
362 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
363 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
364 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
365
366 // If there's no address discriminator, use NoRegister, which we'll later
367 // replace with XZR, or directly use a Z variant of the inst. when available.
368 if (!AddrDisc)
369 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
370
371 return std::make_tuple(
372 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
373 AddrDisc);
374}
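// Illustrative sketch (assumptions noted, not upstream code): the shapes of the
// (constant discriminator, address discriminator) pair returned by the helper
// above for the three interesting forms of Disc; %addr and the constants are
// hypothetical values.
//   // Disc = @llvm.ptrauth.blend(%addr, 1234)
//   //   -> { getTargetConstant(1234, MVT::i64), %addr }
//   // Disc = constant 42 (no blend)
//   //   -> { getTargetConstant(42, MVT::i64), NoRegister placeholder }
//   // Disc = anything else, or a constant that does not fit in 16 bits
//   //   -> { getTargetConstant(0, MVT::i64), Disc }  (computed separately later)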
375
377 const AArch64Subtarget &STI)
378 : TargetLowering(TM), Subtarget(&STI) {
379 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
380 // we have to make something up. Arbitrarily, choose ZeroOrOne.
382 // When comparing vectors the result sets the different elements in the
383 // vector to all-one or all-zero.
385
386 // Set up the register classes.
387 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
388 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
389
390 if (Subtarget->hasLS64()) {
391 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
392 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
394 }
395
396 if (Subtarget->hasFPARMv8()) {
397 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
398 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
399 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
400 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
401 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
402 }
403
404 if (Subtarget->hasNEON()) {
405 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
406 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
407
408 addDRType(MVT::v2f32);
409 addDRType(MVT::v8i8);
410 addDRType(MVT::v4i16);
411 addDRType(MVT::v2i32);
412 addDRType(MVT::v1i64);
413 addDRType(MVT::v1f64);
414 addDRType(MVT::v4f16);
415 addDRType(MVT::v4bf16);
416
417 addQRType(MVT::v4f32);
418 addQRType(MVT::v2f64);
419 addQRType(MVT::v16i8);
420 addQRType(MVT::v8i16);
421 addQRType(MVT::v4i32);
422 addQRType(MVT::v2i64);
423 addQRType(MVT::v8f16);
424 addQRType(MVT::v8bf16);
425 }
426
427 if (Subtarget->isSVEorStreamingSVEAvailable()) {
428 // Add legal sve predicate types
429 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
430 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
431 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
432 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
433 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
434
435 // Add legal sve data types
436 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
437 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
438 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
439 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
440
441 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
442 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
443 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
444 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
445 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
446 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
447
448 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
449 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
451
452 if (Subtarget->useSVEForFixedLengthVectors()) {
455 addRegisterClass(VT, &AArch64::ZPRRegClass);
456
459 addRegisterClass(VT, &AArch64::ZPRRegClass);
460 }
461 }
462
463 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
464 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
465 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
466 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
467
468 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
469 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
470 }
471
472 // Compute derived properties from the register classes
474
475 // Provide all sorts of operation actions
515
517
521
525
527
528 // Custom lowering hooks are needed for XOR
529 // to fold it into CSINC/CSINV.
532
533 // Virtually no operations on f128 are legal, but LLVM can't expand them when
534 // there's a valid register class, so we need custom operations in most cases.
559 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
560 // aren't handled.
561
562 // Lowering for many of the conversions is actually specified by the non-f128
563 // type. The LowerXXX function will be trivial when f128 isn't involved.
588 if (Subtarget->hasFPARMv8()) {
591 }
594 if (Subtarget->hasFPARMv8()) {
597 }
600
605
606 // Variable arguments.
611
612 // Variable-sized objects.
615
616 // Lowering Funnel Shifts to EXTR
621
623
624 // Constant pool entries
626
627 // BlockAddress
629
630 // AArch64 lacks both left-rotate and popcount instructions.
636 }
637
638 // AArch64 doesn't have i32 MULH{S|U}.
641
642 // AArch64 doesn't have {U|S}MUL_LOHI.
647
648 if (Subtarget->hasCSSC()) {
652
654
658
661
666
671 } else {
675
678
681 }
682
688 }
695
696 // Custom lower Add/Sub/Mul with overflow.
709
718
727 if (Subtarget->hasFullFP16()) {
730 } else {
733 }
734
735 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
748 setOperationAction(Op, MVT::f16, Promote);
749 setOperationAction(Op, MVT::v4f16, Expand);
750 setOperationAction(Op, MVT::v8f16, Expand);
751 setOperationAction(Op, MVT::bf16, Promote);
752 setOperationAction(Op, MVT::v4bf16, Expand);
753 setOperationAction(Op, MVT::v8bf16, Expand);
754 }
755
756 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
757 for (auto Op : {
761 ISD::FADD,
762 ISD::FSUB,
763 ISD::FMUL,
764 ISD::FDIV,
765 ISD::FMA,
796 })
797 setOperationAction(Op, ScalarVT, Promote);
798
799 for (auto Op : {ISD::FNEG, ISD::FABS})
800 setOperationAction(Op, ScalarVT, Legal);
801
802 // Round-to-integer operations need custom lowering for fp16, as Promote
803 // doesn't work because the result type is integer.
807 setOperationAction(Op, ScalarVT, Custom);
808
809 // promote v4f16 to v4f32 when that is known to be safe.
810 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
811 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
812 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
813 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
814 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
815 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
816 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
817 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
818 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
819 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
820 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
821 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
822 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
823
833
834 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
856 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
857 };
858
859 if (!Subtarget->hasFullFP16()) {
860 LegalizeNarrowFP(MVT::f16);
861 }
862 LegalizeNarrowFP(MVT::bf16);
865
866 // AArch64 has implementations of a lot of rounding-like FP operations.
867 // clang-format off
868 for (auto Op :
880 for (MVT Ty : {MVT::f32, MVT::f64})
882 if (Subtarget->hasFullFP16())
883 setOperationAction(Op, MVT::f16, Legal);
884 }
885 // clang-format on
886
887 // Basic strict FP operations are legal
890 for (MVT Ty : {MVT::f32, MVT::f64})
892 if (Subtarget->hasFullFP16())
893 setOperationAction(Op, MVT::f16, Legal);
894 }
895
896 // Strict conversion to a larger type is legal
897 for (auto VT : {MVT::f32, MVT::f64})
899
901
907
909 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
912 } else {
915 }
918
919 // Generate outline atomics library calls only if LSE was not specified for
920 // the subtarget.
921 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
947#define LCALLNAMES(A, B, N) \
948 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
949 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
950 setLibcallName(A##N##_REL, #B #N "_rel"); \
951 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
952#define LCALLNAME4(A, B) \
953 LCALLNAMES(A, B, 1) \
954 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
955#define LCALLNAME5(A, B) \
956 LCALLNAMES(A, B, 1) \
957 LCALLNAMES(A, B, 2) \
958 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
959 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
960 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
961 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
962 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
963 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
964 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
965#undef LCALLNAMES
966#undef LCALLNAME4
967#undef LCALLNAME5
968 }
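// Illustrative expansion (for reference only, not part of the upstream file) of
// one macro invocation above: LCALLNAMES(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas, 4)
// token-pastes the size into the enum value and concatenates it into the symbol:
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_RELAX,   "__aarch64_cas4_relax");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_ACQ,     "__aarch64_cas4_acq");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_REL,     "__aarch64_cas4_rel");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_ACQ_REL, "__aarch64_cas4_acq_rel");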
969
970 if (Subtarget->hasLSE128()) {
971 // Custom lowering because i128 is not legal. Must be replaced by 2x64
972 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
976 }
977
978 // 128-bit loads and stores can be done without expanding
981
982 // Aligned 128-bit loads and stores are single-copy atomic according to the
983 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
984 if (Subtarget->hasLSE2()) {
987 }
988
989 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
990 // custom lowering, as there are no un-paired non-temporal stores and
991 // legalization will break up 256 bit inputs.
993 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
994 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
995 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1000
1001 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1002 // custom lowering, as there are no un-paired non-temporal loads and
1003 // legalization will break up 256 bit inputs.
1004 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1005 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1006 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1007 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1008 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1009 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1010 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1011 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1012
1013 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1015
1016 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1017 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1018 // Issue __sincos_stret if available.
1021 } else {
1024 }
1025
1026 // Make floating-point constants legal for the large code model, so they don't
1027 // become loads from the constant pool.
1028 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1031 }
1032
1033 // AArch64 does not have floating-point extending loads, i1 sign-extending
1034 // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
1035 for (MVT VT : MVT::fp_valuetypes()) {
1036 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1037 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1038 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1039 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1040 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1041 }
1042 for (MVT VT : MVT::integer_valuetypes())
1043 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1044
1045 for (MVT WideVT : MVT::fp_valuetypes()) {
1046 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1047 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1048 setTruncStoreAction(WideVT, NarrowVT, Expand);
1049 }
1050 }
1051 }
1052
1053 if (Subtarget->hasFPARMv8()) {
1057 }
1058
1059 // Indexed loads and stores are supported.
1060 for (unsigned im = (unsigned)ISD::PRE_INC;
1062 setIndexedLoadAction(im, MVT::i8, Legal);
1063 setIndexedLoadAction(im, MVT::i16, Legal);
1064 setIndexedLoadAction(im, MVT::i32, Legal);
1065 setIndexedLoadAction(im, MVT::i64, Legal);
1066 setIndexedLoadAction(im, MVT::f64, Legal);
1067 setIndexedLoadAction(im, MVT::f32, Legal);
1068 setIndexedLoadAction(im, MVT::f16, Legal);
1069 setIndexedLoadAction(im, MVT::bf16, Legal);
1070 setIndexedStoreAction(im, MVT::i8, Legal);
1071 setIndexedStoreAction(im, MVT::i16, Legal);
1072 setIndexedStoreAction(im, MVT::i32, Legal);
1073 setIndexedStoreAction(im, MVT::i64, Legal);
1074 setIndexedStoreAction(im, MVT::f64, Legal);
1075 setIndexedStoreAction(im, MVT::f32, Legal);
1076 setIndexedStoreAction(im, MVT::f16, Legal);
1077 setIndexedStoreAction(im, MVT::bf16, Legal);
1078 }
1079
1080 // Trap.
1081 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1084
1085 // We combine OR nodes for bitfield operations.
1087 // Try to create BICs for vector ANDs.
1089
1090 // llvm.init.trampoline and llvm.adjust.trampoline
1093
1094 // Vector add and sub nodes may conceal a high-half opportunity.
1095 // Also, try to fold ADD into CSINC/CSINV..
1098
1101
1102 // Try to combine setcc with csel.
1104
1106
1113
1115
1117
1119
1123
1126
1128
1130
1132
1136
1138
1139 // In case of strict alignment, avoid an excessive number of byte wide stores.
1142 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1143
1147 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1148
1151 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1152
1155 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1156
1158
1160
1161 EnableExtLdPromotion = true;
1162
1163 // Set required alignment.
1165 // Set preferred alignments.
1166
1167 // Don't align loops on Windows. The SEH unwind info generation needs to
1168 // know the exact length of functions before the alignments have been
1169 // expanded.
1170 if (!Subtarget->isTargetWindows())
1174
1175 // Only change the limit for entries in a jump table if specified by
1176 // the subtarget, and only when not already set on the command line.
1177 unsigned MaxJT = STI.getMaximumJumpTableSize();
1178 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1180
1182
1184
1186
1187 if (Subtarget->isNeonAvailable()) {
1188 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1189 // silliness like this:
1190 // clang-format off
1191 for (auto Op :
1212 setOperationAction(Op, MVT::v1f64, Expand);
1213 // clang-format on
1214
1215 for (auto Op :
1220 setOperationAction(Op, MVT::v1i64, Expand);
1221
1222 // AArch64 doesn't have direct vector->f32 conversion instructions for
1223 // elements smaller than i32, so promote the input to i32 first.
1224 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1225 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1226
1227 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1228 // nor a direct i32 -> f16 vector conversion. Set these to Custom so the
1229 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1232 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1234
1235 if (Subtarget->hasFullFP16()) {
1238
1247 } else {
1248 // When AArch64 doesn't have full fp16 support, promote the input
1249 // to i32 first.
1250 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1251 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1252 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1253 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1254 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1255 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1256 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1257 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1258 }
1259
1260 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1261 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1268 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1273 }
1274
1275 // Custom handling for some quad-vector types to detect MULL.
1276 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1277 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1278 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1279 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1280 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1281 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1282
1283 // Saturates
1284 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1285 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1290 }
1291
1292 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1293 MVT::v4i32}) {
1300 }
1301
1302 // Vector reductions
1303 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1304 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1305 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1310
1312 }
1313 }
1314 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1315 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1324 }
1329
1331 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1332 // Likewise, narrowing and extending vector loads/stores aren't handled
1333 // directly.
1336
1337 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1340 } else {
1343 }
1346
1349
1350 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1351 setTruncStoreAction(VT, InnerVT, Expand);
1352 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1353 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1354 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1355 }
1356 }
1357
1358 for (auto Op :
1364 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1366 if (Subtarget->hasFullFP16())
1367 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1369 }
1370
1371 // LRINT and LLRINT.
1372 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1373 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1375 if (Subtarget->hasFullFP16())
1376 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1378 }
1379
1380 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1381
1386
1390
1391 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1392 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1393 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1394 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1395 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1396 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1397
1398 // ADDP custom lowering
1399 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1401 // FADDP custom lowering
1402 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1404 } else /* !isNeonAvailable */ {
1406 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1408
1409 if (VT.is128BitVector() || VT.is64BitVector()) {
1413 Subtarget->isLittleEndian() ? Legal : Expand);
1414 }
1415 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1416 setTruncStoreAction(VT, InnerVT, Expand);
1417 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1418 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1419 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1420 }
1421 }
1422 }
1423
1424 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1428 }
1429
1430 if (Subtarget->hasSME()) {
1432 }
1433
1434 // FIXME: Move lowering for more nodes here if those are common between
1435 // SVE and SME.
1436 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1437 for (auto VT :
1438 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1443 }
1444 }
1445
1446 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1447 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1488
1494
1503
1508
1509 if (!Subtarget->isLittleEndian())
1511
1512 if (Subtarget->hasSVE2() ||
1513 (Subtarget->hasSME() && Subtarget->isStreaming()))
1514 // For SLI/SRI.
1516 }
1517
1518 // Illegal unpacked integer vector types.
1519 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1522 }
1523
1524 // Type legalize unpacked bitcasts.
1525 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1527
1528 for (auto VT :
1529 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1530 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1532
1533 for (auto VT :
1534 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1542
1546
1547 // There are no legal MVT::nxv16f## based types.
1548 if (VT != MVT::nxv16i1) {
1551 }
1552 }
1553
1554 // NEON doesn't support masked loads/stores, but SME and SVE do.
1555 for (auto VT :
1556 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1557 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1558 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1561 }
1562
1563 // Firstly, exclude all scalable vector extending loads/truncating stores,
1564 // including both integer and floating-point scalable vectors.
1566 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1567 setTruncStoreAction(VT, InnerVT, Expand);
1568 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1569 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1570 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1571 }
1572 }
1573
1574 // Then, selectively enable those which we directly support.
1575 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1576 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1577 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1578 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1579 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1580 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1581 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1582 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1583 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1584 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1585 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1586 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1587 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1588 }
1589
1590 // SVE supports truncating stores of 64 and 128-bit vectors
1591 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1592 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1593 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1594 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1595 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1596
1597 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1598 MVT::nxv4f32, MVT::nxv2f64}) {
1638
1660
1672 }
1673
1674 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1685
1686 if (Subtarget->hasSVEB16B16()) {
1695 }
1696 }
1697
1698 for (auto Opcode :
1701 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1702 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1703 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1704 }
1705
1706 if (!Subtarget->hasSVEB16B16()) {
1707 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1709 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1710 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1711 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1712 }
1713 }
1714
1717
1718 // NEON doesn't support integer divides, but SVE does
1719 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1720 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1723 }
1724
1725 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1726 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1727 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1728
1729 // NOTE: Currently this has to happen after computeRegisterProperties rather
1730 // than the preferred option of combining it with the addRegisterClass call.
1731 if (Subtarget->useSVEForFixedLengthVectors()) {
1734 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1735 addTypeForFixedLengthSVE(VT);
1736 }
1739 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1740 addTypeForFixedLengthSVE(VT);
1741 }
1742
1743 // 64-bit results can come from an input bigger than a NEON register.
1744 for (auto VT : {MVT::v8i8, MVT::v4i16})
1747
1748 // 128-bit results imply an input bigger than a NEON register.
1749 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1751 for (auto VT : {MVT::v8f16, MVT::v4f32})
1753
1754 // These operations are not supported on NEON but SVE can do them.
1756 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1757 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1758 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1759 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1760 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1761 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1762 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1763 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1764 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1765 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1766 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1767 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1768 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1769 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1770 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1775
1776 // Int operations with no NEON support.
1777 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1778 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1786 }
1787
1788 // Use SVE for vectors with more than 2 elements.
1789 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1791 }
1792
1793 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1794 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1795 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1796 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1797
1799
1800 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1802 }
1803
1804 // Handle operations that are only available in non-streaming SVE mode.
1805 if (Subtarget->isSVEAvailable()) {
1806 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1807 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1808 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1809 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1810 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1811 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1812 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1815 }
1816
1817 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1818 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1819 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1821
1822 // We can lower types that have <vscale x {2|4}> elements to compact.
1823 for (auto VT :
1824 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1825 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1827
1828 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1829 // NEON vectors in the lowest bits of the SVE register.
1830 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1831 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1833
1834 // Histcnt is SVE2 only
1835 if (Subtarget->hasSVE2()) {
1837 Custom);
1839 Custom);
1840 }
1841 }
1842
1843
1844 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1845 // Only required for llvm.aarch64.mops.memset.tag
1847 }
1848
1850
1851 if (Subtarget->hasSVE()) {
1856 }
1857
1858 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1859
1860 IsStrictFPEnabled = true;
1862
1863 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1864 // it, but it's just a wrapper around ldexp.
1865 if (Subtarget->isTargetWindows()) {
1867 if (isOperationExpand(Op, MVT::f32))
1868 setOperationAction(Op, MVT::f32, Promote);
1869 }
1870
1871 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1872 // isn't legal.
1874 if (isOperationExpand(Op, MVT::f16))
1875 setOperationAction(Op, MVT::f16, Promote);
1876
1877 if (Subtarget->isWindowsArm64EC()) {
1878 // FIXME: are there intrinsics we need to exclude from this?
1879 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1880 auto code = static_cast<RTLIB::Libcall>(i);
1881 auto libcallName = getLibcallName(code);
1882 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1883 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1884 }
1885 }
1886 }
1887}
1888
1889void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1890 assert(VT.isVector() && "VT should be a vector type");
1891
1892 if (VT.isFloatingPoint()) {
1894 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1895 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1896 }
1897
1898 // Mark vector float intrinsics as expand.
1899 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1917 }
1918
1919 // But we do support custom-lowering for FCOPYSIGN.
1920 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1921 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1922 VT == MVT::v8f16) &&
1923 Subtarget->hasFullFP16()))
1925
1938
1942 for (MVT InnerVT : MVT::all_valuetypes())
1943 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1944
1945 // CNT supports only B element sizes, then use UADDLP to widen.
1946 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1948
1954
1955 for (unsigned Opcode :
1958 setOperationAction(Opcode, VT, Custom);
1959
1960 if (!VT.isFloatingPoint())
1962
1963 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1964 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1965 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1966 setOperationAction(Opcode, VT, Legal);
1967
1968 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1969 // NEON types.
1970 if (VT.isFloatingPoint() &&
1971 VT.getVectorElementType() != MVT::bf16 &&
1972 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1973 for (unsigned Opcode :
1979 setOperationAction(Opcode, VT, Legal);
1980
1981 // Strict fp extend and trunc are legal
1982 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1984 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1986
1987 // FIXME: We could potentially make use of the vector comparison instructions
1988 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1989 // complications:
1990 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1991 // so we would need to expand when the condition code doesn't match the
1992 // kind of comparison.
1993 // * Some kinds of comparison require more than one FCMXY instruction so
1994 // would need to be expanded instead.
1995 // * The lowering of the non-strict versions involves target-specific ISD
1996 // nodes so we would likely need to add strict versions of all of them and
1997 // handle them appropriately.
2000
2001 if (Subtarget->isLittleEndian()) {
2002 for (unsigned im = (unsigned)ISD::PRE_INC;
2006 }
2007 }
2008
2009 if (Subtarget->hasD128()) {
2012 }
2013}
2014
2016 EVT OpVT) const {
2017 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2018 if (!Subtarget->hasSVE())
2019 return true;
2020
2021 // We can only support legal predicate result types. We can use the SVE
2022 // whilelo instruction for generating fixed-width predicates too.
2023 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
2024 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
2025 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
2026 return true;
2027
2028 // The whilelo instruction only works with i32 or i64 scalar inputs.
2029 if (OpVT != MVT::i32 && OpVT != MVT::i64)
2030 return true;
2031
2032 return false;
2033}
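// Illustrative sketch (hypothetical calls, not upstream code): a false return
// keeps @llvm.get.active.lane.mask so it can be selected to an SVE whilelo.
// All of the examples assume an SVE-enabled subtarget.
//   shouldExpandGetActiveLaneMask(MVT::nxv4i1, MVT::i64); // false: legal predicate, i64 input
//   shouldExpandGetActiveLaneMask(MVT::v8i1,   MVT::i32); // false: fixed-width predicate is fine
//   shouldExpandGetActiveLaneMask(MVT::nxv4i1, MVT::i16); // true: whilelo takes only i32/i64 inputs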
2034
2036 const IntrinsicInst *I) const {
2037 if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
2038 return true;
2039
2040 EVT VT = EVT::getEVT(I->getType());
2041 auto Op1 = I->getOperand(1);
2042 EVT Op1VT = EVT::getEVT(Op1->getType());
2043 if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
2044 (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
2045 VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
2046 return false;
2047 return true;
2048}
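// Illustrative sketch (assumed IR shape, not upstream code): the hook above keeps
// the partial-reduction intrinsic (returns false) when the input vector shares the
// accumulator's element type and has 2x or 4x its element count, e.g.
//   %r = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add(
//            <vscale x 4 x i32> %acc, <vscale x 16 x i32> %in)   ; 4x ratio
// Any other combination is expanded generically (returns true).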
2049
2051 if (!Subtarget->isSVEorStreamingSVEAvailable())
2052 return true;
2053
2054 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2055 // also support fixed-width predicates.
2056 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2057 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2058 VT != MVT::v4i1 && VT != MVT::v2i1;
2059}
2060
2062 unsigned SearchSize) const {
2063 // MATCH is SVE2 and only available in non-streaming mode.
2064 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2065 return true;
2066 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2067 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2068 return SearchSize != 8;
2069 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2070 return SearchSize != 8 && SearchSize != 16;
2071 return true;
2072}
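// Illustrative sketch (hypothetical calls, not upstream code), assuming SVE2 is
// available in non-streaming mode; a false return keeps the intrinsic so it can
// be selected to the SVE2 MATCH instruction.
//   shouldExpandVectorMatch(MVT::nxv16i8, /*SearchSize=*/16); // false
//   shouldExpandVectorMatch(MVT::nxv8i16, /*SearchSize=*/8);  // false
//   shouldExpandVectorMatch(MVT::nxv16i8, /*SearchSize=*/4);  // true: unsupported needle size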
2073
2074void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2075 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2076
2077 // By default everything must be expanded.
2078 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2080
2081 if (VT.isFloatingPoint()) {
2091 }
2092
2094 VT == MVT::v1f64 ? Expand : Custom;
2095
2096 // Mark integer truncating stores/extending loads as having custom lowering
2097 if (VT.isInteger()) {
2098 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2099 while (InnerVT != VT) {
2100 setTruncStoreAction(VT, InnerVT, Default);
2101 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2102 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2103 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2104 InnerVT = InnerVT.changeVectorElementType(
2105 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2106 }
2107 }
2108
2109 // Mark floating-point truncating stores/extending loads as having custom
2110 // lowering
2111 if (VT.isFloatingPoint()) {
2112 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2113 while (InnerVT != VT) {
2114 setTruncStoreAction(VT, InnerVT, Custom);
2115 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2116 InnerVT = InnerVT.changeVectorElementType(
2118 }
2119 }
2120
2121 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2122 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2123
2124 // Lower fixed length vector operations to scalable equivalents.
2131 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2168 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2169 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2171 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2190 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2216}
2217
2218void AArch64TargetLowering::addDRType(MVT VT) {
2219 addRegisterClass(VT, &AArch64::FPR64RegClass);
2220 if (Subtarget->isNeonAvailable())
2221 addTypeForNEON(VT);
2222}
2223
2224void AArch64TargetLowering::addQRType(MVT VT) {
2225 addRegisterClass(VT, &AArch64::FPR128RegClass);
2226 if (Subtarget->isNeonAvailable())
2227 addTypeForNEON(VT);
2228}
2229
2231 LLVMContext &C, EVT VT) const {
2232 if (!VT.isVector())
2233 return MVT::i32;
2234 if (VT.isScalableVector())
2235 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2237}
2238
2239// isIntImmediate - This method tests to see if the node is a constant
2240// operand. If so Imm will receive the value.
2241static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2242 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2243 Imm = C->getZExtValue();
2244 return true;
2245 }
2246 return false;
2247}
2248
2249// isOpcWithIntImmediate - This method tests to see if the node is a specific
2250// opcode and that it has a immediate integer right operand.
2251// If so Imm will receive the value.
2252static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2253 uint64_t &Imm) {
2254 return N->getOpcode() == Opc &&
2255 isIntImmediate(N->getOperand(1).getNode(), Imm);
2256}
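// Illustrative sketch (not upstream code): typical use of the two helpers above
// when matching a node of the form (and X, C); `N` is a hypothetical SDNode*.
//   uint64_t MaskImm;
//   if (isOpcWithIntImmediate(N, ISD::AND, MaskImm)) {
//     // N is an AND whose right operand is a ConstantSDNode; MaskImm now holds
//     // its zero-extended value.
//   }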
2257
2258static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2259 const APInt &Demanded,
2261 unsigned NewOpc) {
2262 uint64_t OldImm = Imm, NewImm, Enc;
2263 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2264
2265 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2266 // bimm64.
2267 if (Imm == 0 || Imm == Mask ||
2269 return false;
2270
2271 unsigned EltSize = Size;
2272 uint64_t DemandedBits = Demanded.getZExtValue();
2273
2274 // Clear bits that are not demanded.
2275 Imm &= DemandedBits;
2276
2277 while (true) {
2278 // The goal here is to set the non-demanded bits in a way that minimizes
2279 // the number of switching between 0 and 1. In order to achieve this goal,
2280 // we set the non-demanded bits to the value of the preceding demanded bits.
2281 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2282 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2283 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2284 // The final result is 0b11000011.
2285 uint64_t NonDemandedBits = ~DemandedBits;
2286 uint64_t InvertedImm = ~Imm & DemandedBits;
2287 uint64_t RotatedImm =
2288 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2289 NonDemandedBits;
2290 uint64_t Sum = RotatedImm + NonDemandedBits;
2291 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2292 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2293 NewImm = (Imm | Ones) & Mask;
2294
2295 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2296 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2297 // we halve the element size and continue the search.
2298 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2299 break;
2300
2301 // We cannot shrink the element size any further if it is 2-bits.
2302 if (EltSize == 2)
2303 return false;
2304
2305 EltSize /= 2;
2306 Mask >>= EltSize;
2307 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2308
2309 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2310 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2311 return false;
2312
2313 // Merge the upper and lower halves of Imm and DemandedBits.
2314 Imm |= Hi;
2315 DemandedBits |= DemandedBitsHi;
2316 }
2317
2318 ++NumOptimizedImms;
2319
2320 // Replicate the element across the register width.
2321 while (EltSize < Size) {
2322 NewImm |= NewImm << EltSize;
2323 EltSize *= 2;
2324 }
2325
2326 (void)OldImm;
2327 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2328 "demanded bits should never be altered");
2329 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2330
2331 // Create the new constant immediate node.
2332 EVT VT = Op.getValueType();
2333 SDLoc DL(Op);
2334 SDValue New;
2335
2336 // If the new constant immediate is all-zeros or all-ones, let the target
2337 // independent DAG combine optimize this node.
2338 if (NewImm == 0 || NewImm == OrigMask) {
2339 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2340 TLO.DAG.getConstant(NewImm, DL, VT));
2341 // Otherwise, create a machine node so that target independent DAG combine
2342 // doesn't undo this optimization.
2343 } else {
2345 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2346 New = SDValue(
2347 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2348 }
2349
2350 return TLO.CombineTo(Op, New);
2351}
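// Worked trace (for reference only, not upstream code) of the example in the
// loop comment above, assuming Size == EltSize == 8 and the immediate 0bx10xx0x1:
//   DemandedBits   = 0b01100101    NonDemandedBits = 0b10011010
//   Imm (demanded) = 0b01000001    InvertedImm     = 0b00100100
//   RotatedImm     = 0b00001000    Sum             = 0b10100010 (Carry = 0)
//   Ones           = 0b10000010    NewImm          = 0b11000011
// The complement of NewImm within the 8-bit mask is 0b00111100, a shifted mask,
// so the loop exits and 0b11000011 is encodable as a logical immediate.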
2352
2354 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2355 TargetLoweringOpt &TLO) const {
2356 // Delay this optimization to as late as possible.
2357 if (!TLO.LegalOps)
2358 return false;
2359
2361 return false;
2362
2363 EVT VT = Op.getValueType();
2364 if (VT.isVector())
2365 return false;
2366
2367 unsigned Size = VT.getSizeInBits();
2368 assert((Size == 32 || Size == 64) &&
2369 "i32 or i64 is expected after legalization.");
2370
2371 // Exit early if we demand all bits.
2372 if (DemandedBits.popcount() == Size)
2373 return false;
2374
2375 unsigned NewOpc;
2376 switch (Op.getOpcode()) {
2377 default:
2378 return false;
2379 case ISD::AND:
2380 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2381 break;
2382 case ISD::OR:
2383 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2384 break;
2385 case ISD::XOR:
2386 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2387 break;
2388 }
2389 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2390 if (!C)
2391 return false;
2392 uint64_t Imm = C->getZExtValue();
2393 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2394}
2395
2396/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2397/// Mask are known to be either zero or one and return them in Known.
2399 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2400 const SelectionDAG &DAG, unsigned Depth) const {
2401 switch (Op.getOpcode()) {
2402 default:
2403 break;
2404 case AArch64ISD::DUP: {
2405 SDValue SrcOp = Op.getOperand(0);
2406 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2407 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2408 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2409 "Expected DUP implicit truncation");
2410 Known = Known.trunc(Op.getScalarValueSizeInBits());
2411 }
2412 break;
2413 }
2414 case AArch64ISD::CSEL: {
2415 KnownBits Known2;
2416 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2417 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2418 Known = Known.intersectWith(Known2);
2419 break;
2420 }
2421 case AArch64ISD::BICi: {
2422 // Compute the bit cleared value.
2423 APInt Mask =
2424 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2425 .trunc(Known.getBitWidth());
2426 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2427 Known &= KnownBits::makeConstant(Mask);
2428 break;
2429 }
2430 case AArch64ISD::VLSHR: {
2431 KnownBits Known2;
2432 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2433 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2434 Known = KnownBits::lshr(Known, Known2);
2435 break;
2436 }
2437 case AArch64ISD::VASHR: {
2438 KnownBits Known2;
2439 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2440 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2441 Known = KnownBits::ashr(Known, Known2);
2442 break;
2443 }
2444 case AArch64ISD::VSHL: {
2445 KnownBits Known2;
2446 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2447 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2448 Known = KnownBits::shl(Known, Known2);
2449 break;
2450 }
2451 case AArch64ISD::MOVI: {
2453 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2454 break;
2455 }
2457 case AArch64ISD::ADDlow: {
2458 if (!Subtarget->isTargetILP32())
2459 break;
2460 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2461 Known.Zero = APInt::getHighBitsSet(64, 32);
2462 break;
2463 }
2465 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2466 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2467 break;
2468 }
2470 Intrinsic::ID IntID =
2471 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2472 switch (IntID) {
2473 default: return;
2474 case Intrinsic::aarch64_ldaxr:
2475 case Intrinsic::aarch64_ldxr: {
2476 unsigned BitWidth = Known.getBitWidth();
2477 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2478 unsigned MemBits = VT.getScalarSizeInBits();
2479 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2480 return;
2481 }
2482 }
2483 break;
2484 }
2486 case ISD::INTRINSIC_VOID: {
2487 unsigned IntNo = Op.getConstantOperandVal(0);
2488 switch (IntNo) {
2489 default:
2490 break;
2491 case Intrinsic::aarch64_neon_uaddlv: {
2492 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2493 unsigned BitWidth = Known.getBitWidth();
2494 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2495 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2496 assert(BitWidth >= Bound && "Unexpected width!");
2498 Known.Zero |= Mask;
2499 }
2500 break;
2501 }
2502 case Intrinsic::aarch64_neon_umaxv:
2503 case Intrinsic::aarch64_neon_uminv: {
2504 // Figure out the datatype of the vector operand. The UMINV instruction
2505 // will zero extend the result, so we can mark as known zero all the
2506 // bits larger than the element datatype. 32-bit or larger doesn't need
2507 // this as those are legal types and will be handled by isel directly.
2508 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2509 unsigned BitWidth = Known.getBitWidth();
2510 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2511 assert(BitWidth >= 8 && "Unexpected width!");
2513 Known.Zero |= Mask;
2514 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2515 assert(BitWidth >= 16 && "Unexpected width!");
2517 Known.Zero |= Mask;
2518 }
2519 break;
2520 } break;
2521 }
2522 }
2523 }
2524}
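// Illustrative sketch (assumed example, not upstream code) of the ldxr/ldaxr case
// above: an i64-typed @llvm.aarch64.ldxr whose memory VT is i8 has MemBits == 8,
// so the hook reports the top 56 bits of the result as known zero:
//   // Known.Zero |= APInt::getHighBitsSet(64, 64 - 8);  // bits [63:8] are zero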
2525
2527 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2528 unsigned Depth) const {
2529 EVT VT = Op.getValueType();
2530 unsigned VTBits = VT.getScalarSizeInBits();
2531 unsigned Opcode = Op.getOpcode();
2532 switch (Opcode) {
2533 case AArch64ISD::CMEQ:
2534 case AArch64ISD::CMGE:
2535 case AArch64ISD::CMGT:
2536 case AArch64ISD::CMHI:
2537 case AArch64ISD::CMHS:
2538 case AArch64ISD::FCMEQ:
2539 case AArch64ISD::FCMGE:
2540 case AArch64ISD::FCMGT:
2541 case AArch64ISD::CMEQz:
2542 case AArch64ISD::CMGEz:
2543 case AArch64ISD::CMGTz:
2544 case AArch64ISD::CMLEz:
2545 case AArch64ISD::CMLTz:
2546 case AArch64ISD::FCMEQz:
2547 case AArch64ISD::FCMGEz:
2548 case AArch64ISD::FCMGTz:
2549 case AArch64ISD::FCMLEz:
2550 case AArch64ISD::FCMLTz:
2551 // Compares return either 0 or all-ones
2552 return VTBits;
2553 case AArch64ISD::VASHR: {
2554 unsigned Tmp =
2555 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2556 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2557 }
2558 }
2559
2560 return 1;
2561}
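// Illustrative sketch (assumed values, not upstream code) of the rules above:
//   // An AArch64ISD::CMGT on v4i32 produces lanes that are all-zeros or
//   // all-ones, so every bit is a sign bit and VTBits (32) is returned.
//   // A VASHR by 7 of an operand with 5 known sign bits returns min(5 + 7, 32).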
2562
2564 EVT) const {
2565 return MVT::i64;
2566}
2567
2569 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2570 unsigned *Fast) const {
2571
2572 // Allow SVE loads/stores where the alignment >= the size of the element type,
2573 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2574 // for stores that come from IR, only require element-size alignment (even if
2575 // unaligned accesses are disabled). Without this, these will be forced to
2576 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2577 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2578 if (VT.isScalableVector()) {
2579 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2580 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2581 return true;
2582 }
2583
2584 if (Subtarget->requiresStrictAlign())
2585 return false;
2586
2587 if (Fast) {
2588 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2589 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2590 // See comments in performSTORECombine() for more details about
2591 // these conditions.
2592
2593 // Code that uses clang vector extensions can mark that it
2594 // wants unaligned accesses to be treated as fast by
2595 // underspecifying alignment to be 1 or 2.
2596 Alignment <= 2 ||
2597
2598 // Disregard v2i64. Memcpy lowering produces those and splitting
2599 // them regresses performance on micro-benchmarks and olden/bh.
2600 VT == MVT::v2i64;
2601 }
2602 return true;
2603}
2604
2605// Same as above but handling LLTs instead.
2606 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2607 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2608 unsigned *Fast) const {
2609 if (Subtarget->requiresStrictAlign())
2610 return false;
2611
2612 if (Fast) {
2613 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2614 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2615 Ty.getSizeInBytes() != 16 ||
2616 // See comments in performSTORECombine() for more details about
2617 // these conditions.
2618
2619 // Code that uses clang vector extensions can mark that it
2620 // wants unaligned accesses to be treated as fast by
2621 // underspecifying alignment to be 1 or 2.
2622 Alignment <= 2 ||
2623
2624 // Disregard v2i64. Memcpy lowering produces those and splitting
2625 // them regresses performance on micro-benchmarks and olden/bh.
2626 Ty == LLT::fixed_vector(2, 64);
2627 }
2628 return true;
2629}
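// Illustrative caller sketch (hypothetical names, not taken from this file):
// either overload above is typically queried before deciding whether to
// expand an underaligned access, assuming a TargetLowering reference TLI is
// in scope:
//
//   unsigned Fast = 0;
//   if (TLI.allowsMisalignedMemoryAccesses(MVT::v16i8, /*AddrSpace=*/0,
//                                          Align(1), MachineMemOperand::MONone,
//                                          &Fast) &&
//       Fast)
//     ; // keep the unaligned access instead of splitting it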
2630
2631FastISel *
2632 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2633 const TargetLibraryInfo *libInfo) const {
2634 return AArch64::createFastISel(funcInfo, libInfo);
2635}
2636
2637const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2638#define MAKE_CASE(V) \
2639 case V: \
2640 return #V;
2641 switch ((AArch64ISD::NodeType)Opcode) {
2642 case AArch64ISD::FIRST_NUMBER:
2643 break;
2972 }
2973#undef MAKE_CASE
2974 return nullptr;
2975}
2976
2977 MachineBasicBlock *
2978 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2979 MachineBasicBlock *MBB) const {
2980 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2981 // phi node:
2982
2983 // OrigBB:
2984 // [... previous instrs leading to comparison ...]
2985 // b.ne TrueBB
2986 // b EndBB
2987 // TrueBB:
2988 // ; Fallthrough
2989 // EndBB:
2990 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2991
2992 MachineFunction *MF = MBB->getParent();
2993 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2994 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2995 DebugLoc DL = MI.getDebugLoc();
2996 MachineFunction::iterator It = ++MBB->getIterator();
2997
2998 Register DestReg = MI.getOperand(0).getReg();
2999 Register IfTrueReg = MI.getOperand(1).getReg();
3000 Register IfFalseReg = MI.getOperand(2).getReg();
3001 unsigned CondCode = MI.getOperand(3).getImm();
3002 bool NZCVKilled = MI.getOperand(4).isKill();
3003
3004 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
3005 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
3006 MF->insert(It, TrueBB);
3007 MF->insert(It, EndBB);
3008
3009 // Transfer rest of current basic-block to EndBB
3010 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
3011 MBB->end());
3012 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
3013
3014 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
3015 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
3016 MBB->addSuccessor(TrueBB);
3017 MBB->addSuccessor(EndBB);
3018
3019 // TrueBB falls through to the end.
3020 TrueBB->addSuccessor(EndBB);
3021
3022 if (!NZCVKilled) {
3023 TrueBB->addLiveIn(AArch64::NZCV);
3024 EndBB->addLiveIn(AArch64::NZCV);
3025 }
3026
3027 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3028 .addReg(IfTrueReg)
3029 .addMBB(TrueBB)
3030 .addReg(IfFalseReg)
3031 .addMBB(MBB);
3032
3033 MI.eraseFromParent();
3034 return EndBB;
3035}
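// Illustrative note: when the pseudo does not kill NZCV the flags remain live
// across the new control flow, so both TrueBB and EndBB must declare NZCV as
// a live-in for any later users of the flags and to keep the machine verifier
// consistent.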
3036
3037 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
3038 MachineInstr &MI, MachineBasicBlock *BB) const {
3039 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
3040 BB->getParent()->getFunction().getPersonalityFn())) &&
3041 "SEH does not use catchret!");
3042 return BB;
3043}
3044
3045 MachineBasicBlock *
3046 AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
3047 MachineBasicBlock *MBB) const {
3048 MachineFunction &MF = *MBB->getParent();
3049 MachineBasicBlock::iterator MBBI = MI.getIterator();
3051 const AArch64InstrInfo &TII =
3052 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3053 Register TargetReg = MI.getOperand(0).getReg();
3054 MachineBasicBlock::iterator NextInst =
3055 TII.probedStackAlloc(MBBI, TargetReg, false);
3056
3057 MI.eraseFromParent();
3058 return NextInst->getParent();
3059}
3060
3061 MachineBasicBlock *
3062 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3063 MachineInstr &MI,
3064 MachineBasicBlock *BB) const {
3065 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3066 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3067
3068 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3069 MIB.add(MI.getOperand(1)); // slice index register
3070 MIB.add(MI.getOperand(2)); // slice index offset
3071 MIB.add(MI.getOperand(3)); // pg
3072 MIB.add(MI.getOperand(4)); // base
3073 MIB.add(MI.getOperand(5)); // offset
3074
3075 MI.eraseFromParent(); // The pseudo is gone now.
3076 return BB;
3077}
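// Illustrative note: the destination tile is encoded as BaseReg plus the
// pseudo's first immediate, so with (hypothetically) BaseReg == AArch64::ZAD0
// and a tile immediate of 2 the real instruction defines ZAD2; the remaining
// operands (slice index register and offset, governing predicate, base
// address and offset) are forwarded to the target instruction unchanged.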
3078
3079 MachineBasicBlock *AArch64TargetLowering::EmitFill(MachineInstr &MI,
3080 MachineBasicBlock *BB) const {
3081 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3082 MachineInstrBuilder MIB =
3083 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3084
3085 MIB.addReg(AArch64::ZA, RegState::Define);
3086 MIB.add(MI.getOperand(0)); // Vector select register
3087 MIB.add(MI.getOperand(1)); // Vector select offset
3088 MIB.add(MI.getOperand(2)); // Base
3089 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3090
3091 MI.eraseFromParent(); // The pseudo is gone now.
3092 return BB;
3093}
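// Illustrative note: operand 1 is added twice above because LDR_ZA uses a
// single immediate both to select the ZA vector slice and as the vector-scaled
// memory offset, so the two fields are kept identical by construction.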
3094
3095 MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
3096 MachineBasicBlock *BB,
3097 unsigned Opcode,
3098 bool Op0IsDef) const {
3099 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3100 MachineInstrBuilder MIB;
3101
3102 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3103 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3104 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3105 MIB.add(MI.getOperand(I));
3106
3107 MI.eraseFromParent(); // The pseudo is gone now.
3108 return BB;
3109}
3110
3111 MachineBasicBlock *
3112 AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3113 MachineInstr &MI,
3114 MachineBasicBlock *BB) const {
3115 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3116 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3117 unsigned StartIdx = 0;
3118
3119 bool HasTile = BaseReg != AArch64::ZA;
3120 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3121 if (HasZPROut) {
3122 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3123 ++StartIdx;
3124 }
3125 if (HasTile) {
3126 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3127 RegState::Define); // Output ZA Tile
3128 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input ZA Tile
3129 StartIdx++;
3130 } else {
3131 // Skip adding an output ZPR for instructions of the form za.<sz>[Reg, Imm, ...] (operand 1 is an immediate there).
3132 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3133 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3134 ++StartIdx;
3135 }
3136 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3137 }
3138 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3139 MIB.add(MI.getOperand(I));
3140
3141 MI.eraseFromParent(); // The pseudo is gone now.
3142 return BB;
3143}
3144
3145 MachineBasicBlock *
3146 AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
3147 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3148 MachineInstrBuilder MIB =
3149 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3150 MIB.add(MI.getOperand(0)); // Mask
3151
3152 unsigned Mask = MI.getOperand(0).getImm();
3153 for (unsigned I = 0; I < 8; I++) {
3154 if (Mask & (1 << I))
3155 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3156 }
3157
3158 MI.eraseFromParent(); // The pseudo is gone now.
3159 return BB;
3160}
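// Illustrative note: ZERO_M clears whole 64-bit element tiles, so each bit I
// set in the 8-bit mask is modelled as an implicit def of ZAD0 + I; for
// example a mask of 0x0F marks ZAD0-ZAD3 as defined, and 0xFF covers all of
// ZA.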
3161
3162 MachineBasicBlock *
3163 AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
3164 MachineBasicBlock *BB) const {
3165 MachineFunction *MF = BB->getParent();
3166 MachineFrameInfo &MFI = MF->getFrameInfo();
3167 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3168 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3169 if (TPIDR2.Uses > 0) {
3170 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3171 // Store the buffer pointer to the TPIDR2 stack object.
3172 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3173 .addReg(MI.getOperand(0).getReg())
3174 .addFrameIndex(TPIDR2.FrameIndex)
3175 .addImm(0);
3176 // Set the reserved bytes (10-15) to zero
3177 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3178 .addReg(AArch64::WZR)
3179 .addFrameIndex(TPIDR2.FrameIndex)
3180 .addImm(5);
3181 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3182 .addReg(AArch64::WZR)
3183 .addFrameIndex(TPIDR2.FrameIndex)
3184 .addImm(3);
3185 } else
3186 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3187
3188 BB->remove_instr(&MI);
3189 return BB;
3190}
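// Illustrative note on the scaled store immediates above: STRXui with
// immediate 0 writes the 8-byte buffer pointer at byte offset 0 of the TPIDR2
// block, STRHHui with immediate 5 (halfword-scaled) zeroes bytes 10-11, and
// STRWui with immediate 3 (word-scaled) zeroes bytes 12-15, which together
// clear the reserved bytes 10-15 mentioned in the comment.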
3191
3192 MachineBasicBlock *
3193 AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
3194 MachineBasicBlock *BB) const {
3195 MachineFunction *MF = BB->getParent();
3196 MachineFrameInfo &MFI = MF->getFrameInfo();
3197 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3198 // TODO This function grows the stack with a subtraction, which doesn't work
3199 // on Windows. Some refactoring to share the functionality in
3200 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3201 // supports SME
3203 "Lazy ZA save is not yet supported on Windows");
3204
3205 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3206
3207 if (TPIDR2.Uses > 0) {
3208 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3209 MachineRegisterInfo &MRI = MF->getRegInfo();
3210
3211 // The SUBXrs below won't always be emitted in a form that accepts SP
3212 // directly
3213 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3214 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3215 .addReg(AArch64::SP);
3216
3217 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3218 auto Size = MI.getOperand(1).getReg();
3219 auto Dest = MI.getOperand(0).getReg();
3220 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3221 .addReg(Size)
3222 .addReg(Size)
3223 .addReg(SP);
3224 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3225 AArch64::SP)
3226 .addReg(Dest);
3227
3228 // We have just allocated a variable sized object, tell this to PEI.
3229 MFI.CreateVariableSizedObject(Align(16), nullptr);
3230 }
3231
3232 BB->remove_instr(&MI);
3233 return BB;
3234}
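// Illustrative note: MSUBXrrr computes Dest = SP - Size * Size, so when Size
// is the streaming vector length in bytes (the normal case noted above) this
// carves out the SVL x SVL-byte lazy-save buffer; copying Dest back into SP
// commits the allocation, and CreateVariableSizedObject tells PEI the frame
// now contains a dynamically sized object.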
3235
3236// TODO: Find a way to merge this with EmitAllocateZABuffer.
3237 MachineBasicBlock *
3238 AArch64TargetLowering::EmitAllocateSMESaveBuffer(MachineInstr &MI,
3239 MachineBasicBlock *BB) const {
3240 MachineFunction *MF = BB->getParent();
3241 MachineFrameInfo &MFI = MF->getFrameInfo();
3244 "Lazy ZA save is not yet supported on Windows");
3245
3246 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3247 if (FuncInfo->isSMESaveBufferUsed()) {
3248 // Allocate a buffer object of the size given by MI.getOperand(1).
3249 auto Size = MI.getOperand(1).getReg();
3250 auto Dest = MI.getOperand(0).getReg();
3251 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3252 .addReg(AArch64::SP)
3253 .addReg(Size)
3254 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
3255 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3256 .addReg(AArch64::SP);
3257
3258 // We have just allocated a variable sized object, tell this to PEI.
3259 MFI.CreateVariableSizedObject(Align(16), nullptr);
3260 } else
3261 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3262 MI.getOperand(0).getReg());
3263
3264 BB->remove_instr(&MI);
3265 return BB;
3266}
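// Illustrative note: here the requested size is already a byte count, so the
// buffer is reserved by a single SUBXrx64 of SP followed by copying the new
// SP into the result register; when the save buffer is unused the pseudo's
// result is just an IMPLICIT_DEF and the stack is left untouched.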
3267
3268 MachineBasicBlock *
3269 AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
3270 MachineBasicBlock *BB) const {
3271 // If the buffer is used, emit a call to __arm_sme_state_size()
3272 MachineFunction *MF = BB->getParent();