AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <bitset>
95#include <cassert>
96#include <cctype>
97#include <cstdint>
98#include <cstdlib>
99#include <iterator>
100#include <limits>
101#include <optional>
102#include <tuple>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumShiftInserts, "Number of vector shift inserts");
113STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
114
115// FIXME: The necessary dtprel relocations don't seem to be supported
116// well in the GNU bfd and gold linkers at the moment. Therefore, by
117// default, for now, fall back to GeneralDynamic code generation.
119 "aarch64-elf-ldtls-generation", cl::Hidden,
120 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
121 cl::init(false));
122
123static cl::opt<bool>
124EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
125 cl::desc("Enable AArch64 logical imm instruction "
126 "optimization"),
127 cl::init(true));
128
129// Temporary option added for the purpose of testing functionality added
130// to DAGCombiner.cpp in D92230. It is expected that this can be removed
131// in the future once both implementations are based on MGATHER rather
132// than the GLD1 nodes added for the SVE gather load intrinsics.
133static cl::opt<bool>
134EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
135 cl::desc("Combine extends of AArch64 masked "
136 "gather intrinsics"),
137 cl::init(true));
138
139static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
140 cl::desc("Combine ext and trunc to TBL"),
141 cl::init(true));
142
143// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
144// bottleneck after this transform on high-end CPUs. This maximum leaf-node
145// limit guards that the cmp+ccmp transform remains profitable.
146static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
147 cl::desc("Maximum of xors"));
148
149// By turning this on, we will not fall back to DAG ISel when encountering
150// scalable vector types for any instruction, even if SVE is not yet supported
151// for some instructions.
152// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
154 "aarch64-enable-gisel-sve", cl::Hidden,
155 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
156 cl::init(false));
157
158/// Value type used for condition codes.
159static const MVT MVT_CC = MVT::i32;
160
161static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
162 AArch64::X3, AArch64::X4, AArch64::X5,
163 AArch64::X6, AArch64::X7};
164static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
165 AArch64::Q3, AArch64::Q4, AArch64::Q5,
166 AArch64::Q6, AArch64::Q7};
167
169
171
172static inline EVT getPackedSVEVectorVT(EVT VT) {
173 switch (VT.getSimpleVT().SimpleTy) {
174 default:
175 llvm_unreachable("unexpected element type for vector");
176 case MVT::i8:
177 return MVT::nxv16i8;
178 case MVT::i16:
179 return MVT::nxv8i16;
180 case MVT::i32:
181 return MVT::nxv4i32;
182 case MVT::i64:
183 return MVT::nxv2i64;
184 case MVT::f16:
185 return MVT::nxv8f16;
186 case MVT::f32:
187 return MVT::nxv4f32;
188 case MVT::f64:
189 return MVT::nxv2f64;
190 case MVT::bf16:
191 return MVT::nxv8bf16;
192 }
193}
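// Editorial note (illustrative, derived from the switch above rather than
// additional cases): getPackedSVEVectorVT(MVT::i8) == MVT::nxv16i8 and
// getPackedSVEVectorVT(MVT::f32) == MVT::nxv4f32, i.e. each element type maps
// to the scalable vector type whose elements fill a 128-bit SVE granule.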
194
195// NOTE: Currently there's only a need to return integer vector types. If this
196// changes then just add an extra "type" parameter.
198 switch (EC.getKnownMinValue()) {
199 default:
200 llvm_unreachable("unexpected element count for vector");
201 case 16:
202 return MVT::nxv16i8;
203 case 8:
204 return MVT::nxv8i16;
205 case 4:
206 return MVT::nxv4i32;
207 case 2:
208 return MVT::nxv2i64;
209 }
210}
211
213 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
214 "Expected scalable predicate vector type!");
215 switch (VT.getVectorMinNumElements()) {
216 default:
217 llvm_unreachable("unexpected element count for vector");
218 case 2:
219 return MVT::nxv2i64;
220 case 4:
221 return MVT::nxv4i32;
222 case 8:
223 return MVT::nxv8i16;
224 case 16:
225 return MVT::nxv16i8;
226 }
227}
228
229/// Returns true if VT's elements occupy the lowest bit positions of its
230/// associated register class without any intervening space.
231///
232/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
233/// same register class, but only nxv8f16 can be treated as a packed vector.
234static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
236 "Expected legal vector type!");
237 return VT.isFixedLengthVector() ||
239}
240
241// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
242// predicate and end with a passthru value matching the result type.
243static bool isMergePassthruOpcode(unsigned Opc) {
244 switch (Opc) {
245 default:
246 return false;
276 return true;
277 }
278}
279
280// Returns true if inactive lanes are known to be zeroed by construction.
282 switch (Op.getOpcode()) {
283 default:
284 return false;
285 // We guarantee i1 splat_vectors to zero the other lanes
289 return true;
291 switch (Op.getConstantOperandVal(0)) {
292 default:
293 return false;
294 case Intrinsic::aarch64_sve_ptrue:
295 case Intrinsic::aarch64_sve_pnext:
296 case Intrinsic::aarch64_sve_cmpeq:
297 case Intrinsic::aarch64_sve_cmpne:
298 case Intrinsic::aarch64_sve_cmpge:
299 case Intrinsic::aarch64_sve_cmpgt:
300 case Intrinsic::aarch64_sve_cmphs:
301 case Intrinsic::aarch64_sve_cmphi:
302 case Intrinsic::aarch64_sve_cmpeq_wide:
303 case Intrinsic::aarch64_sve_cmpne_wide:
304 case Intrinsic::aarch64_sve_cmpge_wide:
305 case Intrinsic::aarch64_sve_cmpgt_wide:
306 case Intrinsic::aarch64_sve_cmplt_wide:
307 case Intrinsic::aarch64_sve_cmple_wide:
308 case Intrinsic::aarch64_sve_cmphs_wide:
309 case Intrinsic::aarch64_sve_cmphi_wide:
310 case Intrinsic::aarch64_sve_cmplo_wide:
311 case Intrinsic::aarch64_sve_cmpls_wide:
312 case Intrinsic::aarch64_sve_fcmpeq:
313 case Intrinsic::aarch64_sve_fcmpne:
314 case Intrinsic::aarch64_sve_fcmpge:
315 case Intrinsic::aarch64_sve_fcmpgt:
316 case Intrinsic::aarch64_sve_fcmpuo:
317 case Intrinsic::aarch64_sve_facgt:
318 case Intrinsic::aarch64_sve_facge:
319 case Intrinsic::aarch64_sve_whilege:
320 case Intrinsic::aarch64_sve_whilegt:
321 case Intrinsic::aarch64_sve_whilehi:
322 case Intrinsic::aarch64_sve_whilehs:
323 case Intrinsic::aarch64_sve_whilele:
324 case Intrinsic::aarch64_sve_whilelo:
325 case Intrinsic::aarch64_sve_whilels:
326 case Intrinsic::aarch64_sve_whilelt:
327 case Intrinsic::aarch64_sve_match:
328 case Intrinsic::aarch64_sve_nmatch:
329 case Intrinsic::aarch64_sve_whilege_x2:
330 case Intrinsic::aarch64_sve_whilegt_x2:
331 case Intrinsic::aarch64_sve_whilehi_x2:
332 case Intrinsic::aarch64_sve_whilehs_x2:
333 case Intrinsic::aarch64_sve_whilele_x2:
334 case Intrinsic::aarch64_sve_whilelo_x2:
335 case Intrinsic::aarch64_sve_whilels_x2:
336 case Intrinsic::aarch64_sve_whilelt_x2:
337 return true;
338 }
339 }
340}
341
342static std::tuple<SDValue, SDValue>
344 SDLoc DL(Disc);
345 SDValue AddrDisc;
346 SDValue ConstDisc;
347
348 // If this is a blend, remember the constant and address discriminators.
349 // Otherwise, it's either a constant discriminator, or a non-blended
350 // address discriminator.
351 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
352 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
353 AddrDisc = Disc->getOperand(1);
354 ConstDisc = Disc->getOperand(2);
355 } else {
356 ConstDisc = Disc;
357 }
358
359 // If the constant discriminator (either the blend RHS, or the entire
360 // discriminator value) isn't a 16-bit constant, bail out, and let the
361 // discriminator be computed separately.
362 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
363 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
364 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
365
366 // If there's no address discriminator, use NoRegister, which we'll later
367 // replace with XZR, or directly use a Z variant of the inst. when available.
368 if (!AddrDisc)
369 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
370
371 return std::make_tuple(
372 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
373 AddrDisc);
374}
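// Editorial note (an illustrative restatement of the logic above, not extra
// behaviour): for a discriminator formed as llvm.ptrauth.blend(%addr, 1234)
// this returns (TargetConstant 1234, %addr); for a bare constant 42 it returns
// (42, NoRegister); for any other non-constant, non-blend value it returns
// (0, <original Disc>) so the discriminator is computed separately.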
375
377 const AArch64Subtarget &STI)
378 : TargetLowering(TM), Subtarget(&STI) {
379 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
380 // we have to make something up. Arbitrarily, choose ZeroOrOne.
382 // When comparing vectors the result sets the different elements in the
383 // vector to all-one or all-zero.
385
386 // Set up the register classes.
387 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
388 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
389
390 if (Subtarget->hasLS64()) {
391 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
392 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
394 }
395
396 if (Subtarget->hasFPARMv8()) {
397 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
398 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
399 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
400 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
401 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
402 }
403
404 if (Subtarget->hasNEON()) {
405 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
406 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
407
408 addDRType(MVT::v2f32);
409 addDRType(MVT::v8i8);
410 addDRType(MVT::v4i16);
411 addDRType(MVT::v2i32);
412 addDRType(MVT::v1i64);
413 addDRType(MVT::v1f64);
414 addDRType(MVT::v4f16);
415 addDRType(MVT::v4bf16);
416
417 addQRType(MVT::v4f32);
418 addQRType(MVT::v2f64);
419 addQRType(MVT::v16i8);
420 addQRType(MVT::v8i16);
421 addQRType(MVT::v4i32);
422 addQRType(MVT::v2i64);
423 addQRType(MVT::v8f16);
424 addQRType(MVT::v8bf16);
425 }
426
427 if (Subtarget->isSVEorStreamingSVEAvailable()) {
428 // Add legal sve predicate types
429 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
430 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
431 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
432 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
433 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
434
435 // Add legal sve data types
436 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
437 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
438 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
439 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
440
441 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
442 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
443 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
444 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
445 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
446 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
447
448 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
449 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
451
452 if (Subtarget->useSVEForFixedLengthVectors()) {
455 addRegisterClass(VT, &AArch64::ZPRRegClass);
456
459 addRegisterClass(VT, &AArch64::ZPRRegClass);
460 }
461 }
462
463 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
464 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
465 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
466 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
467
468 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
469 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
470 }
471
472 // Compute derived properties from the register classes
474
475 // Provide all sorts of operation actions
515
517
521
525
527
528 // Custom lowering hooks are needed for XOR
529 // to fold it into CSINC/CSINV.
532
533 // Virtually no operation on f128 is legal, but LLVM can't expand them when
534 // there's a valid register class, so we need custom operations in most cases.
559 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
560 // aren't handled.
561
562 // Lowering for many of the conversions is actually specified by the non-f128
563 // type. The LowerXXX function will be trivial when f128 isn't involved.
588 if (Subtarget->hasFPARMv8()) {
591 }
594 if (Subtarget->hasFPARMv8()) {
597 }
600
605
606 // Variable arguments.
611
612 // Variable-sized objects.
615
616 // Lowering Funnel Shifts to EXTR
621
623
624 // Constant pool entries
626
627 // BlockAddress
629
630 // AArch64 lacks both left-rotate and popcount instructions.
636 }
637
638 // AArch64 doesn't have i32 MULH{S|U}.
641
642 // AArch64 doesn't have {U|S}MUL_LOHI.
647
648 if (Subtarget->hasCSSC()) {
652
654
658
661
666
671 } else {
675
678
681 }
682
688 }
695
696 // Custom lower Add/Sub/Mul with overflow.
709
718
727 if (Subtarget->hasFullFP16()) {
730 } else {
733 }
734
735 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
747 setOperationAction(Op, MVT::f16, Promote);
748 setOperationAction(Op, MVT::v4f16, Expand);
749 setOperationAction(Op, MVT::v8f16, Expand);
750 setOperationAction(Op, MVT::bf16, Promote);
751 setOperationAction(Op, MVT::v4bf16, Expand);
752 setOperationAction(Op, MVT::v8bf16, Expand);
753 }
754
755 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
756 for (auto Op : {
760 ISD::FADD,
761 ISD::FSUB,
762 ISD::FMUL,
763 ISD::FDIV,
764 ISD::FMA,
794 })
795 setOperationAction(Op, ScalarVT, Promote);
796
797 for (auto Op : {ISD::FNEG, ISD::FABS})
798 setOperationAction(Op, ScalarVT, Legal);
799
800 // Round-to-integer needs custom lowering for fp16, as Promote doesn't work
801 // because the result type is integer.
805 setOperationAction(Op, ScalarVT, Custom);
806
807 // promote v4f16 to v4f32 when that is known to be safe.
808 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
809 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
810 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
811 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
812 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
813 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
814 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
815 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
816 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
817 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
818 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
819 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
820
830
831 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
853 };
854
855 if (!Subtarget->hasFullFP16()) {
856 LegalizeNarrowFP(MVT::f16);
857 }
858 LegalizeNarrowFP(MVT::bf16);
861
862 // AArch64 has implementations of a lot of rounding-like FP operations.
863 for (auto Op :
874 for (MVT Ty : {MVT::f32, MVT::f64})
876 if (Subtarget->hasFullFP16())
877 setOperationAction(Op, MVT::f16, Legal);
878 }
879
880 // Basic strict FP operations are legal
883 for (MVT Ty : {MVT::f32, MVT::f64})
885 if (Subtarget->hasFullFP16())
886 setOperationAction(Op, MVT::f16, Legal);
887 }
888
889 // Strict conversion to a larger type is legal
890 for (auto VT : {MVT::f32, MVT::f64})
892
894
900
902 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
905 } else {
908 }
911
912 // Generate outline atomics library calls only if LSE was not specified for
913 // the subtarget.
914 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
940#define LCALLNAMES(A, B, N) \
941 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
942 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
943 setLibcallName(A##N##_REL, #B #N "_rel"); \
944 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
945#define LCALLNAME4(A, B) \
946 LCALLNAMES(A, B, 1) \
947 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
948#define LCALLNAME5(A, B) \
949 LCALLNAMES(A, B, 1) \
950 LCALLNAMES(A, B, 2) \
951 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
952 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
953 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
954 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
955 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
956 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
957 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
958#undef LCALLNAMES
959#undef LCALLNAME4
960#undef LCALLNAME5
961 }
962
963 if (Subtarget->hasLSE128()) {
964 // Custom lowering because i128 is not legal. Must be replaced by 2x64
965 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
969 }
970
971 // 128-bit loads and stores can be done without expanding
974
975 // Aligned 128-bit loads and stores are single-copy atomic according to the
976 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
977 if (Subtarget->hasLSE2()) {
980 }
981
982 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
983 // custom lowering, as there are no un-paired non-temporal stores and
984 // legalization will break up 256 bit inputs.
986 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
987 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
988 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
993
994 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
995 // custom lowering, as there are no un-paired non-temporal loads and
996 // legalization will break up 256 bit inputs.
997 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
998 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
999 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1000 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1001 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1002 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1003 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1004 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1005
1006 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1008
1009 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1010 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1011 // Issue __sincos_stret if available.
1014 } else {
1017 }
1018
1019 // Make floating-point constants legal for the large code model, so they don't
1020 // become loads from the constant pool.
1021 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1024 }
1025
1026 // AArch64 does not have floating-point extending loads, i1 sign-extending
1027 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1028 for (MVT VT : MVT::fp_valuetypes()) {
1029 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1030 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1031 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1032 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1033 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1034 }
1035 for (MVT VT : MVT::integer_valuetypes())
1036 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1037
1038 for (MVT WideVT : MVT::fp_valuetypes()) {
1039 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1040 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1041 setTruncStoreAction(WideVT, NarrowVT, Expand);
1042 }
1043 }
1044 }
1045
1046 if (Subtarget->hasFPARMv8()) {
1050 }
1051
1052 // Indexed loads and stores are supported.
1053 for (unsigned im = (unsigned)ISD::PRE_INC;
1055 setIndexedLoadAction(im, MVT::i8, Legal);
1056 setIndexedLoadAction(im, MVT::i16, Legal);
1057 setIndexedLoadAction(im, MVT::i32, Legal);
1058 setIndexedLoadAction(im, MVT::i64, Legal);
1059 setIndexedLoadAction(im, MVT::f64, Legal);
1060 setIndexedLoadAction(im, MVT::f32, Legal);
1061 setIndexedLoadAction(im, MVT::f16, Legal);
1062 setIndexedLoadAction(im, MVT::bf16, Legal);
1063 setIndexedStoreAction(im, MVT::i8, Legal);
1064 setIndexedStoreAction(im, MVT::i16, Legal);
1065 setIndexedStoreAction(im, MVT::i32, Legal);
1066 setIndexedStoreAction(im, MVT::i64, Legal);
1067 setIndexedStoreAction(im, MVT::f64, Legal);
1068 setIndexedStoreAction(im, MVT::f32, Legal);
1069 setIndexedStoreAction(im, MVT::f16, Legal);
1070 setIndexedStoreAction(im, MVT::bf16, Legal);
1071 }
1072
1073 // Trap.
1074 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1077
1078 // We combine OR nodes for bitfield operations.
1080 // Try to create BICs for vector ANDs.
1082
1083 // llvm.init.trampoline and llvm.adjust.trampoline
1086
1087 // Vector add and sub nodes may conceal a high-half opportunity.
1088 // Also, try to fold ADD into CSINC/CSINV.
1091
1094
1095 // Try and combine setcc with csel
1097
1099
1106
1108
1110
1112
1116
1118
1120
1122
1124
1128
1130
1131 // In case of strict alignment, avoid an excessive number of byte wide stores.
1134 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1135
1139 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1140
1143
1146 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1147
1149
1151
1152 EnableExtLdPromotion = true;
1153
1154 // Set required alignment.
1156 // Set preferred alignments.
1157
1158 // Don't align loops on Windows. The SEH unwind info generation needs to
1159 // know the exact length of functions before the alignments have been
1160 // expanded.
1161 if (!Subtarget->isTargetWindows())
1165
1166 // Only change the limit for entries in a jump table if specified by
1167 // the subtarget, but not at the command line.
1168 unsigned MaxJT = STI.getMaximumJumpTableSize();
1169 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1171
1173
1175
1177
1178 if (Subtarget->isNeonAvailable()) {
1179 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1180 // silliness like this:
1181 // clang-format off
1182 for (auto Op :
1202 setOperationAction(Op, MVT::v1f64, Expand);
1203 // clang-format on
1204 for (auto Op :
1209 setOperationAction(Op, MVT::v1i64, Expand);
1210
1211 // AArch64 doesn't have direct vector ->f32 conversion instructions for
1212 // elements smaller than i32, so promote the input to i32 first.
1213 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1214 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1215
1216 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1217 // Or a direct i32 -> f16 vector conversion. Set it to Custom, so the
1218 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1221 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1223
1224 if (Subtarget->hasFullFP16()) {
1227
1236 } else {
1237 // when AArch64 doesn't have fullfp16 support, promote the input
1238 // to i32 first.
1239 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1240 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1241 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1242 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1243 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1244 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1245 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1246 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1247 }
1248
1249 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1250 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1257 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1262 }
1263
1264 // Custom handling for some quad-vector types to detect MULL.
1265 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1266 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1267 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1268 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1269 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1270 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1271
1272 // Saturates
1273 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1274 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1279 }
1280
1281 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1282 MVT::v4i32}) {
1289 }
1290
1291 // Vector reductions
1292 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1293 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1294 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1299
1301 }
1302 }
1303 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1304 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1313 }
1318
1320 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1321 // Likewise, narrowing and extending vector loads/stores aren't handled
1322 // directly.
1325
1326 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1329 } else {
1332 }
1335
1338
1339 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1340 setTruncStoreAction(VT, InnerVT, Expand);
1341 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1342 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1343 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1344 }
1345 }
1346
1347 // AArch64 has implementations of a lot of rounding-like FP operations.
1348 for (auto Op :
1353 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1355 if (Subtarget->hasFullFP16())
1356 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1358 }
1359
1360 // LRINT and LLRINT.
1361 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1362 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1364 if (Subtarget->hasFullFP16())
1365 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1367 }
1368
1369 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1370
1375
1379
1380 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1381 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1382 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1383 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1384 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1385 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1386
1387 // ADDP custom lowering
1388 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1390 // FADDP custom lowering
1391 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1393 } else /* !isNeonAvailable */ {
1395 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1397
1398 if (VT.is128BitVector() || VT.is64BitVector()) {
1402 Subtarget->isLittleEndian() ? Legal : Expand);
1403 }
1404 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1405 setTruncStoreAction(VT, InnerVT, Expand);
1406 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1407 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1408 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1409 }
1410 }
1411 }
1412
1413 if (Subtarget->hasSME()) {
1415 }
1416
1417 // FIXME: Move lowering for more nodes here if those are common between
1418 // SVE and SME.
1419 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1420 for (auto VT :
1421 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1426 }
1427 }
1428
1429 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1430 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1471
1477
1486
1491
1492 if (!Subtarget->isLittleEndian())
1494
1495 if (Subtarget->hasSVE2() ||
1496 (Subtarget->hasSME() && Subtarget->isStreaming()))
1497 // For SLI/SRI.
1499 }
1500
1501 // Illegal unpacked integer vector types.
1502 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1505 }
1506
1507 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1508 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1509 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1511
1512 for (auto VT :
1513 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1514 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1516
1517 for (auto VT :
1518 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1527
1531
1532 // There are no legal MVT::nxv16f## based types.
1533 if (VT != MVT::nxv16i1) {
1536 }
1537 }
1538
1539 // NEON doesn't support masked loads/stores, but SME and SVE do.
1540 for (auto VT :
1541 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1542 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1543 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1546 }
1547
1548 // Firstly, exclude all scalable vector extending loads/truncating stores,
1549 // including both integer and floating-point scalable vectors.
1551 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1552 setTruncStoreAction(VT, InnerVT, Expand);
1553 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1554 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1555 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1556 }
1557 }
1558
1559 // Then, selectively enable those which we directly support.
1560 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1561 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1562 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1563 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1564 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1565 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1566 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1567 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1568 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1569 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1570 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1571 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1572 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1573 }
1574
1575 // SVE supports truncating stores of 64 and 128-bit vectors
1576 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1577 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1578 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1579 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1580 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1581
1582 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1583 MVT::nxv4f32, MVT::nxv2f64}) {
1622
1643
1655
1656 if (!Subtarget->isLittleEndian())
1658 }
1659
1660 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1666
1667 if (!Subtarget->isLittleEndian())
1669 }
1670
1673
1674 // NEON doesn't support integer divides, but SVE does
1675 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1676 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1679 }
1680
1681 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1682 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1683 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1684
1685 // NOTE: Currently this has to happen after computeRegisterProperties rather
1686 // than the preferred option of combining it with the addRegisterClass call.
1687 if (Subtarget->useSVEForFixedLengthVectors()) {
1690 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1691 addTypeForFixedLengthSVE(VT);
1692 }
1695 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1696 addTypeForFixedLengthSVE(VT);
1697 }
1698
1699 // 64-bit results can mean a bigger-than-NEON input.
1700 for (auto VT : {MVT::v8i8, MVT::v4i16})
1703
1704 // 128-bit results imply a bigger-than-NEON input.
1705 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1707 for (auto VT : {MVT::v8f16, MVT::v4f32})
1709
1710 // These operations are not supported on NEON but SVE can do them.
1712 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1713 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1714 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1715 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1716 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1717 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1718 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1719 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1720 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1721 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1722 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1723 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1724 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1725 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1726 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1731
1732 // Int operations with no NEON support.
1733 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1734 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1742 }
1743
1744 // Use SVE for vectors with more than 2 elements.
1745 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1747 }
1748
1749 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1750 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1751 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1752 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1753
1755
1756 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1758 }
1759
1760 // Handle operations that are only available in non-streaming SVE mode.
1761 if (Subtarget->isSVEAvailable()) {
1762 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1763 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1764 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1765 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1766 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1767 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1768 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1771 }
1772
1773 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1774 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1775 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1777
1778 // Histcnt is SVE2 only
1779 if (Subtarget->hasSVE2())
1781 Custom);
1782 }
1783
1784
1785 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1786 // Only required for llvm.aarch64.mops.memset.tag
1788 }
1789
1791
1792 if (Subtarget->hasSVE()) {
1797 }
1798
1799 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1800
1801 IsStrictFPEnabled = true;
1803
1804 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1805 // it, but it's just a wrapper around ldexp.
1806 if (Subtarget->isTargetWindows()) {
1808 if (isOperationExpand(Op, MVT::f32))
1809 setOperationAction(Op, MVT::f32, Promote);
1810 }
1811
1812 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1813 // isn't legal.
1815 if (isOperationExpand(Op, MVT::f16))
1816 setOperationAction(Op, MVT::f16, Promote);
1817
1818 if (Subtarget->isWindowsArm64EC()) {
1819 // FIXME: are there intrinsics we need to exclude from this?
1820 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1821 auto code = static_cast<RTLIB::Libcall>(i);
1822 auto libcallName = getLibcallName(code);
1823 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1824 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1825 }
1826 }
1827 }
1828}
1829
1830void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1831 assert(VT.isVector() && "VT should be a vector type");
1832
1833 if (VT.isFloatingPoint()) {
1835 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1836 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1837 }
1838
1839 // Mark vector float intrinsics as expand.
1840 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1857 }
1858
1859 // But we do support custom-lowering for FCOPYSIGN.
1860 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1861 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1862 VT == MVT::v8f16) &&
1863 Subtarget->hasFullFP16()))
1865
1878
1882 for (MVT InnerVT : MVT::all_valuetypes())
1883 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1884
1885 // CNT supports only B element sizes, then use UADDLP to widen.
1886 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1888
1894
1895 for (unsigned Opcode :
1898 setOperationAction(Opcode, VT, Custom);
1899
1900 if (!VT.isFloatingPoint())
1902
1903 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1904 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1905 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1906 setOperationAction(Opcode, VT, Legal);
1907
1908 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1909 // NEON types.
1910 if (VT.isFloatingPoint() &&
1911 VT.getVectorElementType() != MVT::bf16 &&
1912 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1913 for (unsigned Opcode :
1919 setOperationAction(Opcode, VT, Legal);
1920
1921 // Strict fp extend and trunc are legal
1922 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1924 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1926
1927 // FIXME: We could potentially make use of the vector comparison instructions
1928 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1929 // complications:
1930 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1931 // so we would need to expand when the condition code doesn't match the
1932 // kind of comparison.
1933 // * Some kinds of comparison require more than one FCMXY instruction so
1934 // would need to be expanded instead.
1935 // * The lowering of the non-strict versions involves target-specific ISD
1936 // nodes so we would likely need to add strict versions of all of them and
1937 // handle them appropriately.
1940
1941 if (Subtarget->isLittleEndian()) {
1942 for (unsigned im = (unsigned)ISD::PRE_INC;
1946 }
1947 }
1948
1949 if (Subtarget->hasD128()) {
1952 }
1953}
1954
1956 EVT OpVT) const {
1957 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1958 if (!Subtarget->hasSVE())
1959 return true;
1960
1961 // We can only support legal predicate result types. We can use the SVE
1962 // whilelo instruction for generating fixed-width predicates too.
1963 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1964 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1965 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1966 return true;
1967
1968 // The whilelo instruction only works with i32 or i64 scalar inputs.
1969 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1970 return true;
1971
1972 return false;
1973}
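// Editorial note (an illustrative reading of the checks above, not additional
// constraints): an intrinsic producing nxv4i1 from i64 bounds is left intact
// (this returns false) and can map 1:1 onto whilelo, whereas e.g. a v8i1
// result with i16 operands makes this return true, so the generic expansion
// is used instead.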
1974
1976 if (!Subtarget->isSVEorStreamingSVEAvailable())
1977 return true;
1978
1979 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
1980 // also support fixed-width predicates.
1981 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
1982 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
1983 VT != MVT::v4i1 && VT != MVT::v2i1;
1984}
1985
1986void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1987 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1988
1989 // By default everything must be expanded.
1990 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1992
1993 if (VT.isFloatingPoint()) {
2003 }
2004
2006 VT == MVT::v1f64 ? Expand : Custom;
2007
2008 // Mark integer truncating stores/extending loads as having custom lowering
2009 if (VT.isInteger()) {
2010 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2011 while (InnerVT != VT) {
2012 setTruncStoreAction(VT, InnerVT, Default);
2013 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2014 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2015 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2016 InnerVT = InnerVT.changeVectorElementType(
2017 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2018 }
2019 }
2020
2021 // Mark floating-point truncating stores/extending loads as having custom
2022 // lowering
2023 if (VT.isFloatingPoint()) {
2024 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2025 while (InnerVT != VT) {
2026 setTruncStoreAction(VT, InnerVT, Custom);
2027 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2028 InnerVT = InnerVT.changeVectorElementType(
2030 }
2031 }
2032
2033 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2034 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2035
2036 // Lower fixed length vector operations to scalable equivalents.
2041 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2078 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2079 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2081 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2100 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2126}
2127
2128void AArch64TargetLowering::addDRType(MVT VT) {
2129 addRegisterClass(VT, &AArch64::FPR64RegClass);
2130 if (Subtarget->isNeonAvailable())
2131 addTypeForNEON(VT);
2132}
2133
2134void AArch64TargetLowering::addQRType(MVT VT) {
2135 addRegisterClass(VT, &AArch64::FPR128RegClass);
2136 if (Subtarget->isNeonAvailable())
2137 addTypeForNEON(VT);
2138}
2139
2141 LLVMContext &C, EVT VT) const {
2142 if (!VT.isVector())
2143 return MVT::i32;
2144 if (VT.isScalableVector())
2145 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2147}
2148
2149// isIntImmediate - This method tests to see if the node is a constant
2150// operand. If so, Imm will receive the value.
2151static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2152 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2153 Imm = C->getZExtValue();
2154 return true;
2155 }
2156 return false;
2157}
2158
2159// isOpcWithIntImmediate - This method tests to see if the node is a specific
2160// opcode and that it has an immediate integer right operand.
2161// If so, Imm will receive the value.
2162static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2163 uint64_t &Imm) {
2164 return N->getOpcode() == Opc &&
2165 isIntImmediate(N->getOperand(1).getNode(), Imm);
2166}
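// Editorial note (illustrative, hypothetical values): given N = (or x, 0xff),
// isOpcWithIntImmediate(N, ISD::OR, Imm) returns true and sets Imm to 0xff;
// it returns false if the opcode differs or the right operand is not a
// ConstantSDNode.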
2167
2168static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2169 const APInt &Demanded,
2171 unsigned NewOpc) {
2172 uint64_t OldImm = Imm, NewImm, Enc;
2173 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2174
2175 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2176 // bimm64.
2177 if (Imm == 0 || Imm == Mask ||
2179 return false;
2180
2181 unsigned EltSize = Size;
2182 uint64_t DemandedBits = Demanded.getZExtValue();
2183
2184 // Clear bits that are not demanded.
2185 Imm &= DemandedBits;
2186
2187 while (true) {
2188 // The goal here is to set the non-demanded bits in a way that minimizes
2189 // the number of switching between 0 and 1. In order to achieve this goal,
2190 // we set the non-demanded bits to the value of the preceding demanded bits.
2191 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2192 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2193 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2194 // The final result is 0b11000011.
2195 uint64_t NonDemandedBits = ~DemandedBits;
2196 uint64_t InvertedImm = ~Imm & DemandedBits;
2197 uint64_t RotatedImm =
2198 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2199 NonDemandedBits;
2200 uint64_t Sum = RotatedImm + NonDemandedBits;
2201 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2202 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2203 NewImm = (Imm | Ones) & Mask;
2204
2205 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2206 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2207 // we halve the element size and continue the search.
2208 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2209 break;
2210
2211 // We cannot shrink the element size any further if it is 2 bits.
2212 if (EltSize == 2)
2213 return false;
2214
2215 EltSize /= 2;
2216 Mask >>= EltSize;
2217 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2218
2219 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2220 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2221 return false;
2222
2223 // Merge the upper and lower halves of Imm and DemandedBits.
2224 Imm |= Hi;
2225 DemandedBits |= DemandedBitsHi;
2226 }
2227
2228 ++NumOptimizedImms;
2229
2230 // Replicate the element across the register width.
2231 while (EltSize < Size) {
2232 NewImm |= NewImm << EltSize;
2233 EltSize *= 2;
2234 }
2235
2236 (void)OldImm;
2237 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2238 "demanded bits should never be altered");
2239 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2240
2241 // Create the new constant immediate node.
2242 EVT VT = Op.getValueType();
2243 SDLoc DL(Op);
2244 SDValue New;
2245
2246 // If the new constant immediate is all-zeros or all-ones, let the target
2247 // independent DAG combine optimize this node.
2248 if (NewImm == 0 || NewImm == OrigMask) {
2249 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2250 TLO.DAG.getConstant(NewImm, DL, VT));
2251 // Otherwise, create a machine node so that target independent DAG combine
2252 // doesn't undo this optimization.
2253 } else {
2255 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2256 New = SDValue(
2257 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2258 }
2259
2260 return TLO.CombineTo(Op, New);
2261}
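// Editorial note, a worked example for the replication step above with
// illustrative values (not taken from a specific test): if the search settles
// on EltSize == 8 with NewImm == 0x3c for a 32-bit operation, the replication
// loop widens it to 0x3c3c3c3c. Such a per-element repeated run of ones is
// representable by the AArch64 logical-immediate encoding, so the AND/ORR/EOR
// can take the value as an inline immediate instead of materializing it in a
// register, while only bits that were not demanded have changed.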
2262
2264 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2265 TargetLoweringOpt &TLO) const {
2266 // Delay this optimization to as late as possible.
2267 if (!TLO.LegalOps)
2268 return false;
2269
2271 return false;
2272
2273 EVT VT = Op.getValueType();
2274 if (VT.isVector())
2275 return false;
2276
2277 unsigned Size = VT.getSizeInBits();
2278 assert((Size == 32 || Size == 64) &&
2279 "i32 or i64 is expected after legalization.");
2280
2281 // Exit early if we demand all bits.
2282 if (DemandedBits.popcount() == Size)
2283 return false;
2284
2285 unsigned NewOpc;
2286 switch (Op.getOpcode()) {
2287 default:
2288 return false;
2289 case ISD::AND:
2290 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2291 break;
2292 case ISD::OR:
2293 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2294 break;
2295 case ISD::XOR:
2296 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2297 break;
2298 }
2299 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2300 if (!C)
2301 return false;
2302 uint64_t Imm = C->getZExtValue();
2303 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2304}
2305
2306/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2307/// Mask are known to be either zero or one and return them in Known.
2309 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2310 const SelectionDAG &DAG, unsigned Depth) const {
2311 switch (Op.getOpcode()) {
2312 default:
2313 break;
2314 case AArch64ISD::DUP: {
2315 SDValue SrcOp = Op.getOperand(0);
2316 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2317 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2318 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2319 "Expected DUP implicit truncation");
2320 Known = Known.trunc(Op.getScalarValueSizeInBits());
2321 }
2322 break;
2323 }
2324 case AArch64ISD::CSEL: {
2325 KnownBits Known2;
2326 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2327 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2328 Known = Known.intersectWith(Known2);
2329 break;
2330 }
2331 case AArch64ISD::BICi: {
2332 // Compute the bit cleared value.
2333 uint64_t Mask =
2334 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2335 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2336 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2337 break;
2338 }
2339 case AArch64ISD::VLSHR: {
2340 KnownBits Known2;
2341 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2342 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2343 Known = KnownBits::lshr(Known, Known2);
2344 break;
2345 }
2346 case AArch64ISD::VASHR: {
2347 KnownBits Known2;
2348 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2349 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2350 Known = KnownBits::ashr(Known, Known2);
2351 break;
2352 }
2353 case AArch64ISD::VSHL: {
2354 KnownBits Known2;
2355 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2356 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2357 Known = KnownBits::shl(Known, Known2);
2358 break;
2359 }
2360 case AArch64ISD::MOVI: {
2362 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2363 break;
2364 }
2366 case AArch64ISD::ADDlow: {
2367 if (!Subtarget->isTargetILP32())
2368 break;
2369 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2370 Known.Zero = APInt::getHighBitsSet(64, 32);
2371 break;
2372 }
2374 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2375 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2376 break;
2377 }
2379 Intrinsic::ID IntID =
2380 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2381 switch (IntID) {
2382 default: return;
2383 case Intrinsic::aarch64_ldaxr:
2384 case Intrinsic::aarch64_ldxr: {
2385 unsigned BitWidth = Known.getBitWidth();
2386 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2387 unsigned MemBits = VT.getScalarSizeInBits();
2388 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2389 return;
2390 }
2391 }
2392 break;
2393 }
2395 case ISD::INTRINSIC_VOID: {
2396 unsigned IntNo = Op.getConstantOperandVal(0);
2397 switch (IntNo) {
2398 default:
2399 break;
2400 case Intrinsic::aarch64_neon_uaddlv: {
2401 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2402 unsigned BitWidth = Known.getBitWidth();
2403 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2404 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
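// Editorial note (rationale, not in the original source): a UADDLV of v8i8
// sums at most 8 * 255 = 2040 < 2^11, and of v16i8 at most 16 * 255 = 4080
// < 2^12, so the result fits in Bound bits and all higher bits are known zero.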
2405 assert(BitWidth >= Bound && "Unexpected width!");
2407 Known.Zero |= Mask;
2408 }
2409 break;
2410 }
2411 case Intrinsic::aarch64_neon_umaxv:
2412 case Intrinsic::aarch64_neon_uminv: {
2413 // Figure out the datatype of the vector operand. The UMINV instruction
2414 // will zero extend the result, so we can mark as known zero all the
2415 // bits larger than the element datatype. 32-bit or larger doesn't need
2416 // this as those are legal types and will be handled by isel directly.
2417 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2418 unsigned BitWidth = Known.getBitWidth();
2419 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2420 assert(BitWidth >= 8 && "Unexpected width!");
2422 Known.Zero |= Mask;
2423 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2424 assert(BitWidth >= 16 && "Unexpected width!");
2426 Known.Zero |= Mask;
2427 }
2428 break;
2429 } break;
2430 }
2431 }
2432 }
2433}
2434
2436 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2437 unsigned Depth) const {
2438 EVT VT = Op.getValueType();
2439 unsigned VTBits = VT.getScalarSizeInBits();
2440 unsigned Opcode = Op.getOpcode();
2441 switch (Opcode) {
2442 case AArch64ISD::CMEQ:
2443 case AArch64ISD::CMGE:
2444 case AArch64ISD::CMGT:
2445 case AArch64ISD::CMHI:
2446 case AArch64ISD::CMHS:
2447 case AArch64ISD::FCMEQ:
2448 case AArch64ISD::FCMGE:
2449 case AArch64ISD::FCMGT:
2450 case AArch64ISD::CMEQz:
2451 case AArch64ISD::CMGEz:
2452 case AArch64ISD::CMGTz:
2453 case AArch64ISD::CMLEz:
2454 case AArch64ISD::CMLTz:
2455 case AArch64ISD::FCMEQz:
2456 case AArch64ISD::FCMGEz:
2457 case AArch64ISD::FCMGTz:
2458 case AArch64ISD::FCMLEz:
2459 case AArch64ISD::FCMLTz:
2460 // Compares return either 0 or all-ones
2461 return VTBits;
2462 }
2463
2464 return 1;
2465}
2466
2468 EVT) const {
2469 return MVT::i64;
2470}
2471
2473 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2474 unsigned *Fast) const {
2475 if (Subtarget->requiresStrictAlign())
2476 return false;
2477
2478 if (Fast) {
2479 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2480 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2481 // See comments in performSTORECombine() for more details about
2482 // these conditions.
2483
2484 // Code that uses clang vector extensions can mark that it
2485 // wants unaligned accesses to be treated as fast by
2486 // underspecifying alignment to be 1 or 2.
2487 Alignment <= 2 ||
2488
2489 // Disregard v2i64. Memcpy lowering produces those and splitting
2490 // them regresses performance on micro-benchmarks and olden/bh.
2491 VT == MVT::v2i64;
2492 }
2493 return true;
2494}
2495
2496// Same as above but handling LLTs instead.
2498 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2499 unsigned *Fast) const {
2500 if (Subtarget->requiresStrictAlign())
2501 return false;
2502
2503 if (Fast) {
2504 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2505 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2506 Ty.getSizeInBytes() != 16 ||
2507 // See comments in performSTORECombine() for more details about
2508 // these conditions.
2509
2510 // Code that uses clang vector extensions can mark that it
2511 // wants unaligned accesses to be treated as fast by
2512 // underspecifying alignment to be 1 or 2.
2513 Alignment <= 2 ||
2514
2515 // Disregard v2i64. Memcpy lowering produces those and splitting
2516 // them regresses performance on micro-benchmarks and olden/bh.
2517 Ty == LLT::fixed_vector(2, 64);
2518 }
2519 return true;
2520}
2521
2522FastISel *
2524 const TargetLibraryInfo *libInfo) const {
2525 return AArch64::createFastISel(funcInfo, libInfo);
2526}
2527
2528const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2529#define MAKE_CASE(V) \
2530 case V: \
2531 return #V;
2532 switch ((AArch64ISD::NodeType)Opcode) {
2534 break;
2858 }
2859#undef MAKE_CASE
2860 return nullptr;
2861}
2862
2865 MachineBasicBlock *MBB) const {
2866 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2867 // phi node:
2868
2869 // OrigBB:
2870 // [... previous instrs leading to comparison ...]
2871 // b.ne TrueBB
2872 // b EndBB
2873 // TrueBB:
2874 // ; Fallthrough
2875 // EndBB:
2876 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2877
2878 MachineFunction *MF = MBB->getParent();
2879 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2880 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2881 DebugLoc DL = MI.getDebugLoc();
2883
2884 Register DestReg = MI.getOperand(0).getReg();
2885 Register IfTrueReg = MI.getOperand(1).getReg();
2886 Register IfFalseReg = MI.getOperand(2).getReg();
2887 unsigned CondCode = MI.getOperand(3).getImm();
2888 bool NZCVKilled = MI.getOperand(4).isKill();
2889
2890 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2891 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2892 MF->insert(It, TrueBB);
2893 MF->insert(It, EndBB);
2894
2895 // Transfer rest of current basic-block to EndBB
2896 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2897 MBB->end());
2899
2900 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2901 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2902 MBB->addSuccessor(TrueBB);
2903 MBB->addSuccessor(EndBB);
2904
2905 // TrueBB falls through to the end.
2906 TrueBB->addSuccessor(EndBB);
2907
2908 if (!NZCVKilled) {
2909 TrueBB->addLiveIn(AArch64::NZCV);
2910 EndBB->addLiveIn(AArch64::NZCV);
2911 }
2912
2913 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2914 .addReg(IfTrueReg)
2915 .addMBB(TrueBB)
2916 .addReg(IfFalseReg)
2917 .addMBB(MBB);
2918
2919 MI.eraseFromParent();
2920 return EndBB;
2921}
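// A minimal sketch of the MIR this produces, assuming the pseudo was created
// with an NE condition (hypothetical block names, not from a real compilation):
//   bb.0:
//     Bcc 1 /* NE */, %bb.1
//     B %bb.2
//   bb.1:                  ; TrueBB, falls through
//   bb.2:                  ; EndBB
//     %dest = PHI %iftrue, %bb.1, %iffalse, %bb.0
// which matches the CFG sketched in the comment at the top of the function.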
2922
2923MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2924 MachineInstr &MI, MachineBasicBlock *BB) const {
2925 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2926 BB->getParent()->getFunction().getPersonalityFn())) &&
2927 "SEH does not use catchret!");
2928 return BB;
2929}
2930
2931MachineBasicBlock *
2932AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
2933 MachineBasicBlock *MBB) const {
2934 MachineFunction &MF = *MBB->getParent();
2935 MachineBasicBlock::iterator MBBI = MI.getIterator();
2937 const AArch64InstrInfo &TII =
2938 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2939 Register TargetReg = MI.getOperand(0).getReg();
2940 MachineBasicBlock::iterator NextInst =
2941 TII.probedStackAlloc(MBBI, TargetReg, false);
2942
2943 MI.eraseFromParent();
2944 return NextInst->getParent();
2945}
2946
2947MachineBasicBlock *
2948AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2949 MachineInstr &MI,
2950 MachineBasicBlock *BB) const {
2951 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2952 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2953
2954 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2955 MIB.add(MI.getOperand(1)); // slice index register
2956 MIB.add(MI.getOperand(2)); // slice index offset
2957 MIB.add(MI.getOperand(3)); // pg
2958 MIB.add(MI.getOperand(4)); // base
2959 MIB.add(MI.getOperand(5)); // offset
2960
2961 MI.eraseFromParent(); // The pseudo is gone now.
2962 return BB;
2963}
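// Worked example (assuming the ZA tile registers are consecutive in the
// generated register enum): LD1_MXIPXX_H_PSEUDO_S with a tile immediate of 2
// reaches here with BaseReg = AArch64::ZAS0, so the expanded LD1_MXIPXX_H_S
// defines AArch64::ZAS0 + 2, i.e. ZAS2, and the slice index, slice offset,
// predicate, base and offset operands are copied through unchanged.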
2964
2965MachineBasicBlock *AArch64TargetLowering::EmitFill(MachineInstr &MI,
2966 MachineBasicBlock *BB) const {
2967 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2968 MachineInstrBuilder MIB =
2969 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2970
2971 MIB.addReg(AArch64::ZA, RegState::Define);
2972 MIB.add(MI.getOperand(0)); // Vector select register
2973 MIB.add(MI.getOperand(1)); // Vector select offset
2974 MIB.add(MI.getOperand(2)); // Base
2975 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2976
2977 MI.eraseFromParent(); // The pseudo is gone now.
2978 return BB;
2979}
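// Note on the expansion above (a sketch of the assembly form, for
// illustration): the SME fill is written roughly as
//   ldr za[<Wv>, <imm>], [<Xn|SP>{, #<imm>, mul vl}]
// where the architecture expects the same immediate in the vector-select and
// the memory-offset position, which is why operand 1 of the pseudo is added
// twice.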
2980
2981MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
2982 MachineBasicBlock *BB,
2983 unsigned Opcode,
2984 bool Op0IsDef) const {
2985 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2986 MachineInstrBuilder MIB;
2987
2988 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2989 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2990 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2991 MIB.add(MI.getOperand(I));
2992
2993 MI.eraseFromParent(); // The pseudo is gone now.
2994 return BB;
2995}
2996
2997MachineBasicBlock *
2998AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2999 MachineInstr &MI,
3000 MachineBasicBlock *BB) const {
3001 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3002 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3003 unsigned StartIdx = 0;
3004
3005 bool HasTile = BaseReg != AArch64::ZA;
3006 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3007 if (HasZPROut) {
3008 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3009 ++StartIdx;
3010 }
3011 if (HasTile) {
3012 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3013 RegState::Define); // Output ZA Tile
3014 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input ZA Tile
3015 StartIdx++;
3016 } else {
3017 // Only add an explicit ZPR result when the leading operands are not the
3018 // za.<sz>[Reg, Imm] slice form; that form has no separate ZPR destination.
3018 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3019 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3020 ++StartIdx;
3021 }
3022 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3023 }
3024 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3025 MIB.add(MI.getOperand(I));
3026
3027 MI.eraseFromParent(); // The pseudo is gone now.
3028 return BB;
3029}
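// Illustrative summary (assuming the ZA tile registers are consecutive in the
// generated register enum): for a tile form such as BaseReg = AArch64::ZAS0
// with a tile immediate of 1, the expansion defines and reads ZAS1 and then
// appends the remaining slice/predicate/source operands; for the array form
// (BaseReg == AArch64::ZA) it defines and reads ZA itself, optionally after
// adding an explicit ZPR result.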
3030
3031MachineBasicBlock *AArch64TargetLowering::EmitZero(MachineInstr &MI,
3032 MachineBasicBlock *BB) const {
3033 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3034 MachineInstrBuilder MIB =
3035 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3036 MIB.add(MI.getOperand(0)); // Mask
3037
3038 unsigned Mask = MI.getOperand(0).getImm();
3039 for (unsigned I = 0; I < 8; I++) {
3040 if (Mask & (1 << I))
3041 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3042 }
3043
3044 MI.eraseFromParent(); // The pseudo is gone now.
3045 return BB;
3046}
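// Worked example (assuming the ZAD tile registers are consecutive in the
// generated register enum): a ZERO_M mask of 0b00100101 has bits 0, 2 and 5
// set, so the loop above adds implicit defs of ZAD0, ZAD2 and ZAD5 in
// addition to the mask operand itself.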
3047
3048MachineBasicBlock *
3049AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
3050 MachineBasicBlock *BB) const {
3051 MachineFunction *MF = BB->getParent();
3052 MachineFrameInfo &MFI = MF->getFrameInfo();
3053 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3054 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3055 if (TPIDR2.Uses > 0) {
3056 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3057 // Store the buffer pointer to the TPIDR2 stack object.
3058 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3059 .addReg(MI.getOperand(0).getReg())
3060 .addFrameIndex(TPIDR2.FrameIndex)
3061 .addImm(0);
3062 // Set the reserved bytes (10-15) to zero
3063 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3064 .addReg(AArch64::WZR)
3065 .addFrameIndex(TPIDR2.FrameIndex)
3066 .addImm(5);
3067 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3068 .addReg(AArch64::WZR)
3069 .addFrameIndex(TPIDR2.FrameIndex)
3070 .addImm(3);
3071 } else
3072 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3073
3074 BB->remove_instr(&MI);
3075 return BB;
3076}
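// Offset arithmetic for the stores above (assuming the usual scaled unsigned
// immediates for STRXui/STRHHui/STRWui): STRXui #0 writes the 8-byte buffer
// pointer at byte 0 of the TPIDR2 block, STRHHui #5 zeroes the halfword at
// byte offset 10 (5 * 2), and STRWui #3 zeroes the word at byte offset 12
// (3 * 4), which together clear the reserved bytes 10-15 mentioned in the
// comment.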
3077
3078MachineBasicBlock *
3079AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
3080 MachineBasicBlock *BB) const {
3081 MachineFunction *MF = BB->getParent();
3082 MachineFrameInfo &MFI = MF->getFrameInfo();
3083 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3084 // TODO: This function grows the stack with a subtraction, which doesn't work
3085 // on Windows. Some refactoring to share the functionality in
3086 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3087 // supports SME.
3088 assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3089 "Lazy ZA save is not yet supported on Windows");
3090
3091 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3092
3093 if (TPIDR2.Uses > 0) {
3094 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3095 MachineRegisterInfo &MRI = MF->getRegInfo();
3096
3097 // The MSUBXrrr below won't always be emitted in a form that accepts SP
3098 // directly
3099 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3100 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3101 .addReg(AArch64::SP);
3102
3103 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3104 auto Size = MI.getOperand(1).getReg();
3105 auto Dest = MI.getOperand(0).getReg();
3106 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3107 .addReg(Size)
3108 .addReg(Size)
3109 .addReg(SP);
3110 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3111 AArch64::SP)
3112 .addReg(Dest);
3113
3114 // We have just allocated a variable sized object, tell this to PEI.
3115 MFI.CreateVariableSizedObject(Align(16), nullptr);
3116 }
3117
3118 BB->remove_instr(&MI);
3119 return BB;
3120}
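// Sketch of the allocation above, assuming the usual MSUB operand order
// (minuend last): MSUBXrrr computes Dest = SP - Size * Size, so copying Dest
// back into SP grows the stack downwards by Size * Size bytes (the SVL-by-SVL
// byte lazy-save buffer), and Dest doubles as the buffer pointer produced by
// the pseudo.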
3121
3122MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3123 MachineInstr &MI, MachineBasicBlock *BB) const {
3124
3125 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3126 if (SMEOrigInstr != -1) {
3127 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3128 uint64_t SMEMatrixType =
3129 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3130 switch (SMEMatrixType) {
3131 case AArch64::SMEMatrixArray:
3132 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3133 case AArch64::SMEMatrixTileB:
3134 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3135 case AArch64::SMEMatrixTileH:
3136 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3137 case AArch64::SMEMatrixTileS:
3138 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3139 case AArch64::SMEMatrixTileD:
3140 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3141 case AArch64::SMEMatrixTileQ:
3142 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3143 }
3144 }
3145
3146 switch (MI.getOpcode()) {
3147 default:
3148#ifndef NDEBUG
3149 MI.dump();
3150#endif
3151 llvm_unreachable("Unexpected instruction for custom inserter!");
3152 case AArch64::InitTPIDR2Obj:
3153 return EmitInitTPIDR2Object(MI, BB);
3154 case AArch64::AllocateZABuffer:
3155 return EmitAllocateZABuffer(MI, BB);
3156 case AArch64::F128CSEL:
3157 return EmitF128CSEL(MI, BB);
3158 case TargetOpcode::STATEPOINT:
3159 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3160 // while the BL call instruction (to which the statepoint is lowered at
3161 // the end) has an implicit def of LR. That def is early-clobber as it is
3162 // written at the moment of the call, earlier than any use is read.
3163 // Add this implicit dead def here as a workaround.
3164 MI.addOperand(*MI.getMF(),
3165 MachineOperand::CreateReg(
3166 AArch64::LR, /*isDef*/ true,
3167 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3168 /*isUndef*/ false, /*isEarlyClobber*/ true));
3169 [[fallthrough]];
3170 case TargetOpcode::STACKMAP:
3171 case TargetOpcode::PATCHPOINT:
3172 return emitPatchPoint(MI, BB);
3173
3174 case TargetOpcode::PATCHABLE_EVENT_CALL:
3175 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3176 return BB;
3177
3178 case AArch64::CATCHRET:
3179 return EmitLoweredCatchRet(MI, BB);
3180
3181 case AArch64::PROBED_STACKALLOC_DYN:
3182 return EmitDynamicProbedAlloc(MI, BB);
3183
3184 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3185 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3186 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3187 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3188 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3189 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3190 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3191 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3192 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3193 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3194 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3195 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3196 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3197 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3198 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3199 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3200 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3201 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3202 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3203 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3204 case AArch64::LDR_ZA_PSEUDO:
3205 return EmitFill(MI, BB);
3206 case AArch64::LDR_TX_PSEUDO:
3207 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3208 case AArch64::STR_TX_PSEUDO:
3209 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3210 case AArch64::ZERO_M_PSEUDO:
3211 return EmitZero(MI, BB);
3212 case AArch64::ZERO_T_PSEUDO:
3213 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3214 }
3215}
3216
3217//===----------------------------------------------------------------------===//
3218// AArch64 Lowering private implementation.
3219//===----------------------------------------------------------------------===//
3220
3221//===----------------------------------------------------------------------===//
3222// Lowering Code
3223//===----------------------------------------------------------------------===//
3224
3225// Forward declarations of SVE fixed length lowering helpers
3230 SelectionDAG &DAG);
3233 EVT VT);
3234
3235/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3236static bool isZerosVector(const SDNode *N) {
3237 // Look through a bit convert.
3238 while (N->getOpcode() == ISD::BITCAST)
3239 N = N->getOperand(0).getNode();
3240
3241 if (ISD::isConstantSplatVectorAllZeros(N))
3242 return true;
3243
3244 if (N->getOpcode() != AArch64ISD::DUP)
3245 return false;
3246
3247 auto Opnd0 = N->getOperand(0);
3248 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3249}
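// Examples of what this matches (illustrative): a constant splat of zero,
// possibly hidden behind one or more BITCAST nodes, or an AArch64ISD::DUP
// whose operand is integer 0 or floating-point +0.0.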
3250
3251/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3252/// CC
3253static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3254 switch (CC) {
3255 default:
3256 llvm_unreachable("Unknown condition code!");
3257 case ISD::SETNE:
3258 return AArch64CC::NE;
3259 case ISD::SETEQ:
3260 return AArch64CC::EQ;
3261 case ISD::SETGT:
3262 return AArch64CC::GT;
3263 case ISD::SETGE:
3264 return AArch64CC::GE;
3265 case ISD::SETLT:
3266 return AArch64CC::LT;
3267 case ISD::SETLE:
3268 return AArch64CC::LE;
3269 case ISD::SETUGT:
3270 return AArch64CC::HI;
3271 case ISD::SETUGE:
3272 return AArch64CC::HS;
3273 case ISD::SETULT:
3274 return AArch64CC::LO;
3275 case ISD::SETULE:
3276 return AArch64CC::LS;
3277 }
3278}
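// Example mapping: an unsigned "less than" (ISD::SETULT) becomes
// AArch64CC::LO ("unsigned lower"), the condition a subsequent CSEL or B.cc
// consumes from the NZCV flags set by a preceding SUBS/CMP.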
3279
3280/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3281static void changeFPCCToAArch64CC(ISD::CondCode CC,
3282 AArch64CC::CondCode &CondCode,
3283 AArch64CC::CondCode &CondCode2) {
3284 CondCode2 = AArch64CC::AL;
3285 switch (CC) {
3286 default:
3287 llvm_unreachable("Unknown FP condition!");
3288 case ISD::SETEQ:
3289 case ISD::SETOEQ:
3290 CondCode = AArch64CC::EQ;
3291 break;
3292 case ISD::SETGT:
3293 case ISD::SETOGT:
3294 CondCode = AArch64CC::GT;
3295 break;
3296 case ISD::SETGE: