1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
 130// in the future, once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
 142// XOR, OR and CMP all use ALU ports, and data dependencies become the
 143// bottleneck after this transform on high-end CPUs. So this maximum leaf-node
 144// limit guards that cmp+ccmp remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
157
159
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
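// Example of the mapping above: getPackedSVEVectorVT(MVT::f32) returns
// MVT::nxv4f32 and getPackedSVEVectorVT(MVT::i8) returns MVT::nxv16i8, i.e. in
// each case the scalable vector whose elements exactly fill a 128-bit SVE
// granule.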
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
279 return true;
281 switch (Op.getConstantOperandVal(0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
430
431 // Provide all sorts of operation actions
470
474
478
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
540 if (Subtarget->hasFPARMv8()) {
543 }
546 if (Subtarget->hasFPARMv8()) {
549 }
552
557
558 // Variable arguments.
563
564 // Variable-sized objects.
567
568 // Lowering Funnel Shifts to EXTR
573
575
576 // Constant pool entries
578
579 // BlockAddress
581
582 // AArch64 lacks both left-rotate and popcount instructions.
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
599
600 if (Subtarget->hasCSSC()) {
604
606
610
613
618
623 } else {
627
630
633 }
634
640 }
647
648 // Custom lower Add/Sub/Mul with overflow.
661
670
679 if (Subtarget->hasFullFP16()) {
682 } else {
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
742 })
743 setOperationAction(Op, ScalarVT, Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, ScalarVT, Legal);
747
 748 // Round-to-integer operations need custom lowering for fp16, as Promote
 749 // doesn't work because the result type is integer.
753 setOperationAction(Op, ScalarVT, Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
778
779 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
822 for (MVT Ty : {MVT::f32, MVT::f64})
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
831 for (MVT Ty : {MVT::f32, MVT::f64})
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
840
842
848
850 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
853 } else {
856 }
859
860 // Generate outline atomics library calls only if LSE was not specified for
861 // subtarget
862 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
888#define LCALLNAMES(A, B, N) \
889 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
890 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
891 setLibcallName(A##N##_REL, #B #N "_rel"); \
892 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
893#define LCALLNAME4(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
896#define LCALLNAME5(A, B) \
897 LCALLNAMES(A, B, 1) \
898 LCALLNAMES(A, B, 2) \
899 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
900 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
903 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
904 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
905 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
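 // Illustrative expansion of the macros above (not additional code): applying
 // LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) registers, among
 // others,
 //   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_ACQ, "__aarch64_swp4_acq");
 // i.e. one outline-atomic libcall name per access size (1/2/4/8 bytes, plus
 // 16 for CAS) and per memory-ordering suffix (_relax/_acq/_rel/_acq_rel).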
906#undef LCALLNAMES
907#undef LCALLNAME4
908#undef LCALLNAME5
909 }
910
911 if (Subtarget->hasLSE128()) {
912 // Custom lowering because i128 is not legal. Must be replaced by 2x64
913 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
917 }
918
919 // 128-bit loads and stores can be done without expanding
922
923 // Aligned 128-bit loads and stores are single-copy atomic according to the
924 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
925 if (Subtarget->hasLSE2()) {
928 }
929
930 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
931 // custom lowering, as there are no un-paired non-temporal stores and
932 // legalization will break up 256 bit inputs.
934 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
935 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
936 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
941
942 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
 943 // custom lowering, as there are no un-paired non-temporal loads, and
 944 // legalization will break up 256 bit inputs.
945 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
946 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
947 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
948 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
949 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
950 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
951 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
952 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
953
954 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
956
957 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
958 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
959 // Issue __sincos_stret if available.
962 } else {
965 }
966
967 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
968 // MSVCRT doesn't have powi; fall back to pow
969 setLibcallName(RTLIB::POWI_F32, nullptr);
970 setLibcallName(RTLIB::POWI_F64, nullptr);
971 }
972
973 // Make floating-point constants legal for the large code model, so they don't
974 // become loads from the constant pool.
975 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
978 }
979
980 // AArch64 does not have floating-point extending loads, i1 sign-extending
981 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
982 for (MVT VT : MVT::fp_valuetypes()) {
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
985 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
986 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
987 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
988 }
989 for (MVT VT : MVT::integer_valuetypes())
991
992 for (MVT WideVT : MVT::fp_valuetypes()) {
993 for (MVT NarrowVT : MVT::fp_valuetypes()) {
994 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
995 setTruncStoreAction(WideVT, NarrowVT, Expand);
996 }
997 }
998 }
999
1000 if (Subtarget->hasFPARMv8()) {
1004 }
1005
1006 // Indexed loads and stores are supported.
1007 for (unsigned im = (unsigned)ISD::PRE_INC;
1009 setIndexedLoadAction(im, MVT::i8, Legal);
1010 setIndexedLoadAction(im, MVT::i16, Legal);
1011 setIndexedLoadAction(im, MVT::i32, Legal);
1012 setIndexedLoadAction(im, MVT::i64, Legal);
1013 setIndexedLoadAction(im, MVT::f64, Legal);
1014 setIndexedLoadAction(im, MVT::f32, Legal);
1015 setIndexedLoadAction(im, MVT::f16, Legal);
1016 setIndexedLoadAction(im, MVT::bf16, Legal);
1017 setIndexedStoreAction(im, MVT::i8, Legal);
1018 setIndexedStoreAction(im, MVT::i16, Legal);
1019 setIndexedStoreAction(im, MVT::i32, Legal);
1020 setIndexedStoreAction(im, MVT::i64, Legal);
1021 setIndexedStoreAction(im, MVT::f64, Legal);
1022 setIndexedStoreAction(im, MVT::f32, Legal);
1023 setIndexedStoreAction(im, MVT::f16, Legal);
1024 setIndexedStoreAction(im, MVT::bf16, Legal);
1025 }
1026
1027 // Trap.
1028 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1031
1032 // We combine OR nodes for bitfield operations.
1034 // Try to create BICs for vector ANDs.
1036
1037 // Vector add and sub nodes may conceal a high-half opportunity.
1038 // Also, try to fold ADD into CSINC/CSINV..
1041
1044
1045 // Try and combine setcc with csel
1047
1049
1056
1058
1060
1062
1066
1068
1070
1072
1074
1078
1080
 1081 // In case of strict alignment, avoid an excessive number of byte-wide stores.
1084 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1085
1089 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1090
1093
1096 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1097
1099
1101
1102 EnableExtLdPromotion = true;
1103
1104 // Set required alignment.
1106 // Set preferred alignments.
1107
1108 // Don't align loops on Windows. The SEH unwind info generation needs to
1109 // know the exact length of functions before the alignments have been
1110 // expanded.
1111 if (!Subtarget->isTargetWindows())
1115
1116 // Only change the limit for entries in a jump table if specified by
 1117 // the subtarget, but not at the command line.
1118 unsigned MaxJT = STI.getMaximumJumpTableSize();
1119 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1121
1123
1125
1127
1128 if (Subtarget->hasNEON()) {
1129 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1130 // silliness like this:
1131 for (auto Op :
1149 setOperationAction(Op, MVT::v1f64, Expand);
1150
1151 for (auto Op :
1156 setOperationAction(Op, MVT::v1i64, Expand);
1157
 1158 // AArch64 doesn't have direct vector->f32 conversion instructions for
1159 // elements smaller than i32, so promote the input to i32 first.
1160 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1161 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1162
 1163 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
 1164 // nor a direct i32 -> f16 one. Set these to Custom, so the
 1165 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1168 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1170
1171 if (Subtarget->hasFullFP16()) {
1174
1183 } else {
 1184 // When AArch64 doesn't have fullfp16 support, promote the input
1185 // to i32 first.
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1188 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1191 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1192 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1193 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1194 }
1195
1196 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1197 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1204 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1209 }
1210
1211 // Custom handling for some quad-vector types to detect MULL.
1212 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1215 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1216 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1217 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1218
1219 // Saturates
1220 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1221 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1226 }
1227
1228 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1229 MVT::v4i32}) {
1236 }
1237
1238 // Vector reductions
1239 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1240 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1241 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1246
1248 }
1249 }
1250 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1251 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1260 }
1265
1267 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1268 // Likewise, narrowing and extending vector loads/stores aren't handled
1269 // directly.
1272
1273 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1276 } else {
1279 }
1282
1285
1286 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1287 setTruncStoreAction(VT, InnerVT, Expand);
1288 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1289 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1290 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1291 }
1292 }
1293
1294 // AArch64 has implementations of a lot of rounding-like FP operations.
1295 for (auto Op :
1300 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1302 if (Subtarget->hasFullFP16())
1303 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1305 }
1306
1307 // LRINT and LLRINT.
1308 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1309 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1311 if (Subtarget->hasFullFP16())
1312 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1314 }
1315
1316 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1317
1322
1326
1327 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1328 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1329 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1330 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1331 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1332 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1333
1334 // ADDP custom lowering
1335 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1337 // FADDP custom lowering
1338 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1340 }
1341
1342 if (Subtarget->hasSME()) {
1344 }
1345
1346 // FIXME: Move lowering for more nodes here if those are common between
1347 // SVE and SME.
1348 if (Subtarget->hasSVEorSME()) {
1349 for (auto VT :
1350 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1355 }
1356 }
1357
1358 if (Subtarget->hasSVEorSME()) {
1359 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1402
1408
1417
1422
1423 if (!Subtarget->isLittleEndian())
1425
1426 if (Subtarget->hasSVE2orSME())
1427 // For SLI/SRI.
1429 }
1430
1431 // Illegal unpacked integer vector types.
1432 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1435 }
1436
1437 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1438 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1439 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1441
1442 for (auto VT :
1443 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1444 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1446
1447 for (auto VT :
1448 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1456
1460
1461 // There are no legal MVT::nxv16f## based types.
1462 if (VT != MVT::nxv16i1) {
1465 }
1466 }
1467
1468 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1469 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1470 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1471 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1476 }
1477
1478 // Firstly, exclude all scalable vector extending loads/truncating stores,
 1479 // including both integer and floating-point scalable vectors.
1481 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1482 setTruncStoreAction(VT, InnerVT, Expand);
1483 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1484 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1485 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1486 }
1487 }
1488
1489 // Then, selectively enable those which we directly support.
1490 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1491 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1492 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1493 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1494 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1495 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1496 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1497 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1498 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1499 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1500 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1501 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1502 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1503 }
1504
1505 // SVE supports truncating stores of 64 and 128-bit vectors
1506 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1507 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1508 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1509 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1510 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1511
1512 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1513 MVT::nxv4f32, MVT::nxv2f64}) {
1551 if (Subtarget->isSVEAvailable())
1556
1570
1582
1583 if (!Subtarget->isLittleEndian())
1585 }
1586
1587 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1595
1596 if (!Subtarget->isLittleEndian())
1598 }
1599
1602
1603 // NEON doesn't support integer divides, but SVE does
1604 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1605 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1608 }
1609
1610 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1611 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1612 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1613
1614 if (Subtarget->isSVEAvailable()) {
1615 // NEON doesn't support across-vector reductions, but SVE does.
1616 for (auto VT :
1617 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1619 }
1620
1621 // Histcnt is SVE2 only
1622 if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable())
1624 Custom);
1625
1626 // NOTE: Currently this has to happen after computeRegisterProperties rather
1627 // than the preferred option of combining it with the addRegisterClass call.
1628 if (Subtarget->useSVEForFixedLengthVectors()) {
1631 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1632 addTypeForFixedLengthSVE(VT);
1633 }
1636 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1637 addTypeForFixedLengthSVE(VT);
1638 }
1639
1640 // 64bit results can mean a bigger than NEON input.
1641 for (auto VT : {MVT::v8i8, MVT::v4i16})
1644
1645 // 128bit results imply a bigger than NEON input.
1646 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1648 for (auto VT : {MVT::v8f16, MVT::v4f32})
1650
1651 // These operations are not supported on NEON but SVE can do them.
1653 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1654 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1655 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1656 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1657 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1658 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1659 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1660 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1661 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1662 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1663 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1664 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1665 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1666 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1667 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1672
1673 // Int operations with no NEON support.
1674 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1675 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1683 }
1684
1685 // Use SVE for vectors with more than 2 elements.
1686 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1688 }
1689
1690 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1691 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1692 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1693 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1694
1696
1697 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1699 }
1700
1701 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1702 // Only required for llvm.aarch64.mops.memset.tag
1704 }
1705
1707
1708 if (Subtarget->hasSVE()) {
1713 }
1714
1715 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1716
1717 IsStrictFPEnabled = true;
1719
1720 if (Subtarget->isWindowsArm64EC()) {
1721 // FIXME: are there intrinsics we need to exclude from this?
1722 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1723 auto code = static_cast<RTLIB::Libcall>(i);
1724 auto libcallName = getLibcallName(code);
1725 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1726 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1727 }
1728 }
1729 }
1730}
1731
1732void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1733 assert(VT.isVector() && "VT should be a vector type");
1734
1735 if (VT.isFloatingPoint()) {
1737 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1738 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1739 }
1740
1741 // Mark vector float intrinsics as expand.
1742 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1752 }
1753
1754 // But we do support custom-lowering for FCOPYSIGN.
1755 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1756 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1757 VT == MVT::v8f16) &&
1758 Subtarget->hasFullFP16()))
1760
1773
1777 for (MVT InnerVT : MVT::all_valuetypes())
1778 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1779
1780 // CNT supports only B element sizes, then use UADDLP to widen.
1781 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1783
1789
1790 for (unsigned Opcode :
1793 setOperationAction(Opcode, VT, Custom);
1794
1795 if (!VT.isFloatingPoint())
1797
1798 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1799 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1800 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1801 setOperationAction(Opcode, VT, Legal);
1802
1803 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1804 // NEON types.
1805 if (VT.isFloatingPoint() &&
1806 VT.getVectorElementType() != MVT::bf16 &&
1807 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1808 for (unsigned Opcode :
1814 setOperationAction(Opcode, VT, Legal);
1815
1816 // Strict fp extend and trunc are legal
1817 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1819 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1821
1822 // FIXME: We could potentially make use of the vector comparison instructions
 1823 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1824 // complications:
1825 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1826 // so we would need to expand when the condition code doesn't match the
1827 // kind of comparison.
1828 // * Some kinds of comparison require more than one FCMXY instruction so
1829 // would need to be expanded instead.
1830 // * The lowering of the non-strict versions involves target-specific ISD
1831 // nodes so we would likely need to add strict versions of all of them and
1832 // handle them appropriately.
1835
1836 if (Subtarget->isLittleEndian()) {
1837 for (unsigned im = (unsigned)ISD::PRE_INC;
1841 }
1842 }
1843
1844 if (Subtarget->hasD128()) {
1847 }
1848}
1849
1851 EVT OpVT) const {
1852 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1853 if (!Subtarget->hasSVE())
1854 return true;
1855
1856 // We can only support legal predicate result types. We can use the SVE
1857 // whilelo instruction for generating fixed-width predicates too.
1858 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1859 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1860 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1861 return true;
1862
1863 // The whilelo instruction only works with i32 or i64 scalar inputs.
1864 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1865 return true;
1866
1867 return false;
1868}
1869
1871 if (!Subtarget->hasSVEorSME())
1872 return true;
1873
1874 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
1875 // also support fixed-width predicates.
1876 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
1877 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
1878 VT != MVT::v4i1 && VT != MVT::v2i1;
1879}
1880
1881void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1882 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1883
1884 // By default everything must be expanded.
1885 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1887
1888 if (VT.isFloatingPoint()) {
1898 }
1899
1901 VT == MVT::v1f64 ? Expand : Custom;
1902
1903 // Mark integer truncating stores/extending loads as having custom lowering
1904 if (VT.isInteger()) {
1905 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1906 while (InnerVT != VT) {
1907 setTruncStoreAction(VT, InnerVT, Default);
1908 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
1909 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
1910 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1911 InnerVT = InnerVT.changeVectorElementType(
1912 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1913 }
1914 }
1915
1916 // Mark floating-point truncating stores/extending loads as having custom
1917 // lowering
1918 if (VT.isFloatingPoint()) {
1919 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1920 while (InnerVT != VT) {
1921 setTruncStoreAction(VT, InnerVT, Custom);
1922 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1923 InnerVT = InnerVT.changeVectorElementType(
1925 }
1926 }
1927
1928 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
1929 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
1930
1931 // Lower fixed length vector operations to scalable equivalents.
1936 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
1973 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
1974 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
1976 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
1995 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2021}
2022
2023void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
2024 addRegisterClass(VT, &AArch64::FPR64RegClass);
2025 addTypeForNEON(VT);
2026}
2027
2028void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
2029 addRegisterClass(VT, &AArch64::FPR128RegClass);
2030 addTypeForNEON(VT);
2031}
2032
2034 LLVMContext &C, EVT VT) const {
2035 if (!VT.isVector())
2036 return MVT::i32;
2037 if (VT.isScalableVector())
2038 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2040}
2041
2042// isIntImmediate - This method tests to see if the node is a constant
 2043// operand. If so, Imm will receive the value.
2044static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2045 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2046 Imm = C->getZExtValue();
2047 return true;
2048 }
2049 return false;
2050}
2051
2052// isOpcWithIntImmediate - This method tests to see if the node is a specific
 2053// opcode and that it has an immediate integer right operand.
 2054// If so, Imm will receive the value.
2055static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2056 uint64_t &Imm) {
2057 return N->getOpcode() == Opc &&
2058 isIntImmediate(N->getOperand(1).getNode(), Imm);
2059}
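// For instance, isOpcWithIntImmediate(N, ISD::SRL, Imm) succeeds only when N
// is a right-shift whose second operand is a ConstantSDNode, leaving the shift
// amount in Imm.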
2060
2061static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2062 const APInt &Demanded,
2064 unsigned NewOpc) {
2065 uint64_t OldImm = Imm, NewImm, Enc;
2066 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2067
2068 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2069 // bimm64.
2070 if (Imm == 0 || Imm == Mask ||
2072 return false;
2073
2074 unsigned EltSize = Size;
2075 uint64_t DemandedBits = Demanded.getZExtValue();
2076
2077 // Clear bits that are not demanded.
2078 Imm &= DemandedBits;
2079
2080 while (true) {
2081 // The goal here is to set the non-demanded bits in a way that minimizes
2082 // the number of switching between 0 and 1. In order to achieve this goal,
2083 // we set the non-demanded bits to the value of the preceding demanded bits.
2084 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2085 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2086 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2087 // The final result is 0b11000011.
2088 uint64_t NonDemandedBits = ~DemandedBits;
2089 uint64_t InvertedImm = ~Imm & DemandedBits;
2090 uint64_t RotatedImm =
2091 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2092 NonDemandedBits;
2093 uint64_t Sum = RotatedImm + NonDemandedBits;
2094 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2095 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2096 NewImm = (Imm | Ones) & Mask;
2097
2098 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2099 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2100 // we halve the element size and continue the search.
2101 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2102 break;
2103
 2104 // We cannot shrink the element size any further if it is already 2 bits.
2105 if (EltSize == 2)
2106 return false;
2107
2108 EltSize /= 2;
2109 Mask >>= EltSize;
2110 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2111
 2112 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2113 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2114 return false;
2115
2116 // Merge the upper and lower halves of Imm and DemandedBits.
2117 Imm |= Hi;
2118 DemandedBits |= DemandedBitsHi;
2119 }
2120
2121 ++NumOptimizedImms;
2122
2123 // Replicate the element across the register width.
2124 while (EltSize < Size) {
2125 NewImm |= NewImm << EltSize;
2126 EltSize *= 2;
2127 }
2128
2129 (void)OldImm;
2130 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2131 "demanded bits should never be altered");
2132 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2133
2134 // Create the new constant immediate node.
2135 EVT VT = Op.getValueType();
2136 SDLoc DL(Op);
2137 SDValue New;
2138
2139 // If the new constant immediate is all-zeros or all-ones, let the target
2140 // independent DAG combine optimize this node.
2141 if (NewImm == 0 || NewImm == OrigMask) {
2142 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2143 TLO.DAG.getConstant(NewImm, DL, VT));
2144 // Otherwise, create a machine node so that target independent DAG combine
2145 // doesn't undo this optimization.
2146 } else {
2148 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2149 New = SDValue(
2150 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2151 }
2152
2153 return TLO.CombineTo(Op, New);
2154}
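// A minimal standalone sketch of the bit-propagation idea documented inside
// optimizeLogicalImm above (each non-demanded bit takes the value of the
// nearest lower demanded bit, wrapping around). This is illustrative only and
// is not part of this file; the helper name is hypothetical, and it uses a
// simple loop rather than the rotate-and-carry trick used by the real code.
//
//   #include <cassert>
//   #include <cstdint>
//
//   static uint8_t fillNonDemandedBits(uint8_t Imm, uint8_t DemandedBits) {
//     assert(DemandedBits != 0 && "need at least one demanded bit");
//     uint8_t Result = Imm & DemandedBits;
//     for (unsigned Bit = 0; Bit < 8; ++Bit) {
//       if (DemandedBits & (1u << Bit))
//         continue; // demanded bits keep their value
//       unsigned Src = (Bit + 7) % 8; // nearest lower bit, wrapping
//       while (!(DemandedBits & (1u << Src)))
//         Src = (Src + 7) % 8;
//       if (Result & (1u << Src))
//         Result |= (uint8_t)(1u << Bit);
//     }
//     return Result;
//   }
//
// With Imm = 0b01000001 and DemandedBits = 0b01100101 (the 0bx10xx0x1 example
// from the comment above), this returns 0b11000011, the value the comment
// derives by hand.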
2155
2157 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2158 TargetLoweringOpt &TLO) const {
2159 // Delay this optimization to as late as possible.
2160 if (!TLO.LegalOps)
2161 return false;
2162
2164 return false;
2165
2166 EVT VT = Op.getValueType();
2167 if (VT.isVector())
2168 return false;
2169
2170 unsigned Size = VT.getSizeInBits();
2171 assert((Size == 32 || Size == 64) &&
2172 "i32 or i64 is expected after legalization.");
2173
2174 // Exit early if we demand all bits.
2175 if (DemandedBits.popcount() == Size)
2176 return false;
2177
2178 unsigned NewOpc;
2179 switch (Op.getOpcode()) {
2180 default:
2181 return false;
2182 case ISD::AND:
2183 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2184 break;
2185 case ISD::OR:
2186 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2187 break;
2188 case ISD::XOR:
2189 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2190 break;
2191 }
2192 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2193 if (!C)
2194 return false;
2195 uint64_t Imm = C->getZExtValue();
2196 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2197}
2198
2199/// computeKnownBitsForTargetNode - Determine which of the bits specified in
 2200/// Mask are known to be either zero or one and return them in Known.
2202 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2203 const SelectionDAG &DAG, unsigned Depth) const {
2204 switch (Op.getOpcode()) {
2205 default:
2206 break;
2207 case AArch64ISD::DUP: {
2208 SDValue SrcOp = Op.getOperand(0);
2209 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2210 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2211 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2212 "Expected DUP implicit truncation");
2213 Known = Known.trunc(Op.getScalarValueSizeInBits());
2214 }
2215 break;
2216 }
2217 case AArch64ISD::CSEL: {
2218 KnownBits Known2;
2219 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2220 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2221 Known = Known.intersectWith(Known2);
2222 break;
2223 }
2224 case AArch64ISD::BICi: {
2225 // Compute the bit cleared value.
2226 uint64_t Mask =
2227 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2228 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2229 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2230 break;
2231 }
2232 case AArch64ISD::VLSHR: {
2233 KnownBits Known2;
2234 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2235 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2236 Known = KnownBits::lshr(Known, Known2);
2237 break;
2238 }
2239 case AArch64ISD::VASHR: {
2240 KnownBits Known2;
2241 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2242 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2243 Known = KnownBits::ashr(Known, Known2);
2244 break;
2245 }
2246 case AArch64ISD::VSHL: {
2247 KnownBits Known2;
2248 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2249 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2250 Known = KnownBits::shl(Known, Known2);
2251 break;
2252 }
2253 case AArch64ISD::MOVI: {
2255 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2256 break;
2257 }
2259 case AArch64ISD::ADDlow: {
2260 if (!Subtarget->isTargetILP32())
2261 break;
2262 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2263 Known.Zero = APInt::getHighBitsSet(64, 32);
2264 break;
2265 }
2267 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2268 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2269 break;
2270 }
2272 Intrinsic::ID IntID =
2273 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2274 switch (IntID) {
2275 default: return;
2276 case Intrinsic::aarch64_ldaxr:
2277 case Intrinsic::aarch64_ldxr: {
2278 unsigned BitWidth = Known.getBitWidth();
2279 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2280 unsigned MemBits = VT.getScalarSizeInBits();
2281 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2282 return;
2283 }
2284 }
2285 break;
2286 }
2288 case ISD::INTRINSIC_VOID: {
2289 unsigned IntNo = Op.getConstantOperandVal(0);
2290 switch (IntNo) {
2291 default:
2292 break;
2293 case Intrinsic::aarch64_neon_uaddlv: {
2294 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2295 unsigned BitWidth = Known.getBitWidth();
2296 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2297 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2298 assert(BitWidth >= Bound && "Unexpected width!");
2300 Known.Zero |= Mask;
2301 }
2302 break;
2303 }
2304 case Intrinsic::aarch64_neon_umaxv:
2305 case Intrinsic::aarch64_neon_uminv: {
2306 // Figure out the datatype of the vector operand. The UMINV instruction
2307 // will zero extend the result, so we can mark as known zero all the
 2308 // bits larger than the element datatype. 32-bit or larger doesn't need
2309 // this as those are legal types and will be handled by isel directly.
2310 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2311 unsigned BitWidth = Known.getBitWidth();
2312 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2313 assert(BitWidth >= 8 && "Unexpected width!");
2315 Known.Zero |= Mask;
2316 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2317 assert(BitWidth >= 16 && "Unexpected width!");
2319 Known.Zero |= Mask;
2320 }
2321 break;
2322 } break;
2323 }
2324 }
2325 }
2326}
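// A minimal sketch (illustrative only, hypothetical names, not LLVM API) of
// the known-bits rule applied in the CSEL case above: a select-like node can
// only claim a bit as known if both possible results agree on it, so the two
// operands' known bits are intersected.
//
//   struct SimpleKnownBits {
//     uint64_t Zero = 0, One = 0; // disjoint known-zero / known-one masks
//   };
//   static SimpleKnownBits intersectKnown(const SimpleKnownBits &A,
//                                         const SimpleKnownBits &B) {
//     return {A.Zero & B.Zero, A.One & B.One};
//   }
//
// For example, if one arm is known to be 0b0110 and the other 0b0100, only
// bit 2 (known one) and bits 0 and 3 (known zero) survive the intersection;
// bit 1 becomes unknown.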
2327
2329 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2330 unsigned Depth) const {
2331 EVT VT = Op.getValueType();
2332 unsigned VTBits = VT.getScalarSizeInBits();
2333 unsigned Opcode = Op.getOpcode();
2334 switch (Opcode) {
2335 case AArch64ISD::CMEQ:
2336 case AArch64ISD::CMGE:
2337 case AArch64ISD::CMGT:
2338 case AArch64ISD::CMHI:
2339 case AArch64ISD::CMHS:
2340 case AArch64ISD::FCMEQ:
2341 case AArch64ISD::FCMGE:
2342 case AArch64ISD::FCMGT:
2343 case AArch64ISD::CMEQz:
2344 case AArch64ISD::CMGEz:
2345 case AArch64ISD::CMGTz:
2346 case AArch64ISD::CMLEz:
2347 case AArch64ISD::CMLTz:
2348 case AArch64ISD::FCMEQz:
2349 case AArch64ISD::FCMGEz:
2350 case AArch64ISD::FCMGTz:
2351 case AArch64ISD::FCMLEz:
2352 case AArch64ISD::FCMLTz:
2353 // Compares return either 0 or all-ones
2354 return VTBits;
2355 }
2356
2357 return 1;
2358}
2359
2361 EVT) const {
2362 return MVT::i64;
2363}
2364
2366 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2367 unsigned *Fast) const {
2368 if (Subtarget->requiresStrictAlign())
2369 return false;
2370
2371 if (Fast) {
2372 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2373 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2374 // See comments in performSTORECombine() for more details about
2375 // these conditions.
2376
2377 // Code that uses clang vector extensions can mark that it
2378 // wants unaligned accesses to be treated as fast by
2379 // underspecifying alignment to be 1 or 2.
2380 Alignment <= 2 ||
2381
2382 // Disregard v2i64. Memcpy lowering produces those and splitting
2383 // them regresses performance on micro-benchmarks and olden/bh.
2384 VT == MVT::v2i64;
2385 }
2386 return true;
2387}
2388
2389// Same as above but handling LLTs instead.
2391 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2392 unsigned *Fast) const {
2393 if (Subtarget->requiresStrictAlign())
2394 return false;
2395
2396 if (Fast) {
2397 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2398 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2399 Ty.getSizeInBytes() != 16 ||
2400 // See comments in performSTORECombine() for more details about
2401 // these conditions.
2402
2403 // Code that uses clang vector extensions can mark that it
2404 // wants unaligned accesses to be treated as fast by
2405 // underspecifying alignment to be 1 or 2.
2406 Alignment <= 2 ||
2407
2408 // Disregard v2i64. Memcpy lowering produces those and splitting
2409 // them regresses performance on micro-benchmarks and olden/bh.
2410 Ty == LLT::fixed_vector(2, 64);
2411 }
2412 return true;
2413}
2414
2415FastISel *
2417 const TargetLibraryInfo *libInfo) const {
2418 return AArch64::createFastISel(funcInfo, libInfo);
2419}
2420
2421const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2422#define MAKE_CASE(V) \
2423 case V: \
2424 return #V;
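// For illustration, MAKE_CASE(AArch64ISD::CSEL) expands to
//   case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
// so the switch below maps each AArch64ISD node type to its printable name.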
2425 switch ((AArch64ISD::NodeType)Opcode) {
2427 break;
2744 }
2745#undef MAKE_CASE
2746 return nullptr;
2747}
2748
2751 MachineBasicBlock *MBB) const {
2752 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2753 // phi node:
2754
2755 // OrigBB:
2756 // [... previous instrs leading to comparison ...]
2757 // b.ne TrueBB
2758 // b EndBB
2759 // TrueBB:
2760 // ; Fallthrough
2761 // EndBB:
2762 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2763
2764 MachineFunction *MF = MBB->getParent();
2765 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2766 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2767 DebugLoc DL = MI.getDebugLoc();
2769
2770 Register DestReg = MI.getOperand(0).getReg();
2771 Register IfTrueReg = MI.getOperand(1).getReg();
2772 Register IfFalseReg = MI.getOperand(2).getReg();
2773 unsigned CondCode = MI.getOperand(3).getImm();
2774 bool NZCVKilled = MI.getOperand(4).isKill();
2775
2776 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2777 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2778 MF->insert(It, TrueBB);
2779 MF->insert(It, EndBB);
2780
 2781 // Transfer the rest of the current basic block to EndBB.
2782 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2783 MBB->end());
2785
2786 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2787 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2788 MBB->addSuccessor(TrueBB);
2789 MBB->addSuccessor(EndBB);
2790
2791 // TrueBB falls through to the end.
2792 TrueBB->addSuccessor(EndBB);
2793
2794 if (!NZCVKilled) {
2795 TrueBB->addLiveIn(AArch64::NZCV);
2796 EndBB->addLiveIn(AArch64::NZCV);
2797 }
2798
2799 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2800 .addReg(IfTrueReg)
2801 .addMBB(TrueBB)
2802 .addReg(IfFalseReg)
2803 .addMBB(MBB);
2804
2805 MI.eraseFromParent();
2806 return EndBB;
2807}
2808
2810 MachineInstr &MI, MachineBasicBlock *BB) const {
2812 BB->getParent()->getFunction().getPersonalityFn())) &&
2813 "SEH does not use catchret!");
2814 return BB;
2815}
2816
2819 MachineBasicBlock *MBB) const {
2820 MachineFunction &MF = *MBB->getParent();
2821 MachineBasicBlock::iterator MBBI = MI.getIterator();
2823 const AArch64InstrInfo &TII =
2824 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2825 Register TargetReg = MI.getOperand(0).getReg();
2827 TII.probedStackAlloc(MBBI, TargetReg, false);
2828
2829 MI.eraseFromParent();
2830 return NextInst->getParent();
2831}
2832
2834AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2836 MachineBasicBlock *BB) const {
2837 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2838 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2839
2840 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2841 MIB.add(MI.getOperand(1)); // slice index register
2842 MIB.add(MI.getOperand(2)); // slice index offset
2843 MIB.add(MI.getOperand(3)); // pg
2844 MIB.add(MI.getOperand(4)); // base
2845 MIB.add(MI.getOperand(5)); // offset
2846
2847 MI.eraseFromParent(); // The pseudo is gone now.
2848 return BB;
2849}
2850
2853 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2855 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2856
2857 MIB.addReg(AArch64::ZA, RegState::Define);
2858 MIB.add(MI.getOperand(0)); // Vector select register
2859 MIB.add(MI.getOperand(1)); // Vector select offset
2860 MIB.add(MI.getOperand(2)); // Base
2861 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2862
2863 MI.eraseFromParent(); // The pseudo is gone now.
2864 return BB;
2865}
2866
2869 unsigned Opcode,
2870 bool Op0IsDef) const {
2871 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2873
2874 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2875 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2876 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2877 MIB.add(MI.getOperand(I));
2878
2879 MI.eraseFromParent(); // The pseudo is gone now.
2880 return BB;
2881}
2882
2884AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2886 MachineBasicBlock *BB, bool HasTile) const {
2887 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2888 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2889 unsigned StartIdx = 0;
2890
2891 if (HasTile) {
2892 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2893 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2894 StartIdx = 1;
2895 } else
2896 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2897
2898 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2899 MIB.add(MI.getOperand(I));
2900
2901 MI.eraseFromParent(); // The pseudo is gone now.
2902 return BB;
2903}
2904
2905MachineBasicBlock *AArch64TargetLowering::EmitZero(MachineInstr &MI,
2906 MachineBasicBlock *BB) const {
2907 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2908 MachineInstrBuilder MIB =
2909 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2910 MIB.add(MI.getOperand(0)); // Mask
2911
2912 unsigned Mask = MI.getOperand(0).getImm();
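 // Each set bit in the 8-bit mask selects one 64-bit tile, so mark ZAD0..ZAD7
 // as implicitly defined for every bit that is set.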
2913 for (unsigned I = 0; I < 8; I++) {
2914 if (Mask & (1 << I))
2915 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2916 }
2917
2918 MI.eraseFromParent(); // The pseudo is gone now.
2919 return BB;
2920}
2921
2922MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2923 MachineInstr &MI, MachineBasicBlock *BB) const {
2924
2925 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2926 if (SMEOrigInstr != -1) {
2927 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2928 uint64_t SMEMatrixType =
2929 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2930 switch (SMEMatrixType) {
2931 case AArch64::SMEMatrixArray:
2932 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2933 case AArch64::SMEMatrixTileB:
2934 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2935 case AArch64::SMEMatrixTileH:
2936 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2937 case AArch64::SMEMatrixTileS:
2938 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2939 case AArch64::SMEMatrixTileD:
2940 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2941 case AArch64::SMEMatrixTileQ:
2942 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2943 }
2944 }
2945
2946 switch (MI.getOpcode()) {
2947 default:
2948#ifndef NDEBUG
2949 MI.dump();
2950#endif
2951 llvm_unreachable("Unexpected instruction for custom inserter!");
2952
2953 case AArch64::F128CSEL:
2954 return EmitF128CSEL(MI, BB);
2955 case TargetOpcode::STATEPOINT:
2956 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2957 // while the BL call instruction (which the statepoint is eventually
2958 // lowered to) has an implicit def of LR. This def is early-clobber as it
2959 // is written at the moment of the call, before any use is read.
2960 // Add this implicit dead def here as a workaround.
2961 MI.addOperand(*MI.getMF(),
2962 MachineOperand::CreateReg(
2963 AArch64::LR, /*isDef*/ true,
2964 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2965 /*isUndef*/ false, /*isEarlyClobber*/ true));
2966 [[fallthrough]];
2967 case TargetOpcode::STACKMAP:
2968 case TargetOpcode::PATCHPOINT:
2969 return emitPatchPoint(MI, BB);
2970
2971 case TargetOpcode::PATCHABLE_EVENT_CALL:
2972 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2973 return BB;
2974
2975 case AArch64::CATCHRET:
2976 return EmitLoweredCatchRet(MI, BB);
2977
2978 case AArch64::PROBED_STACKALLOC_DYN:
2979 return EmitDynamicProbedAlloc(MI, BB);
2980
2981 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2982 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2983 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2984 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2985 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2986 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2987 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2988 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2989 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2990 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2991 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2992 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2993 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2994 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2995 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2996 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2997 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2998 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2999 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3000 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3001 case AArch64::LDR_ZA_PSEUDO:
3002 return EmitFill(MI, BB);
3003 case AArch64::LDR_TX_PSEUDO:
3004 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3005 case AArch64::STR_TX_PSEUDO:
3006 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3007 case AArch64::ZERO_M_PSEUDO:
3008 return EmitZero(MI, BB);
3009 case AArch64::ZERO_T_PSEUDO:
3010 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3011 }
3012}
3013
3014//===----------------------------------------------------------------------===//
3015// AArch64 Lowering private implementation.
3016//===----------------------------------------------------------------------===//
3017
3018//===----------------------------------------------------------------------===//
3019// Lowering Code
3020//===----------------------------------------------------------------------===//
3021
3022// Forward declarations of SVE fixed length lowering helpers
3023static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
3024static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3025static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3026static SDValue convertFixedMaskToScalableVector(SDValue Mask,
3027 SelectionDAG &DAG);
3028static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG,
3029 SDLoc &DL,
3030 EVT VT);
3031
3032/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3033static bool isZerosVector(const SDNode *N) {
3034 // Look through a bit convert.
3035 while (N->getOpcode() == ISD::BITCAST)
3036 N = N->getOperand(0).getNode();
3037
3038 if (ISD::isConstantSplatVectorAllZeros(N))
3039 return true;
3040
3041 if (N->getOpcode() != AArch64ISD::DUP)
3042 return false;
3043
3044 auto Opnd0 = N->getOperand(0);
3045 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3046}
3047
3048/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3049/// CC
3050static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3051 switch (CC) {
3052 default:
3053 llvm_unreachable("Unknown condition code!");
3054 case ISD::SETNE:
3055 return AArch64CC::NE;
3056 case ISD::SETEQ:
3057 return AArch64CC::EQ;
3058 case ISD::SETGT:
3059 return AArch64CC::GT;
3060 case ISD::SETGE:
3061 return AArch64CC::GE;
3062 case ISD::SETLT:
3063 return AArch64CC::LT;
3064 case ISD::SETLE:
3065 return AArch64CC::LE;
3066 case ISD::SETUGT:
3067 return AArch64CC::HI;
3068 case ISD::SETUGE:
3069 return AArch64CC::HS;
3070 case ISD::SETULT:
3071 return AArch64CC::LO;
3072 case ISD::SETULE:
3073 return AArch64CC::LS;
3074 }
3075}
3076
3077/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3078static void changeFPCCToAArch64CC(ISD::CondCode CC,
3079 AArch64CC::CondCode &CondCode,
3080 AArch64CC::CondCode &CondCode2) {
3081 CondCode2 = AArch64CC::AL;
3082 switch (CC) {
3083 default:
3084 llvm_unreachable("Unknown FP condition!");
3085 case ISD::SETEQ:
3086 case ISD::SETOEQ:
3087 CondCode = AArch64CC::EQ;
3088 break;
3089 case ISD::SETGT:
3090 case ISD::SETOGT:
3091 CondCode = AArch64CC::GT;
3092 break;
3093 case ISD::SETGE:
3094 case ISD::SETOGE:
3095 CondCode = AArch64CC::GE;
3096 break;
3097 case ISD::SETOLT:
3098 CondCode = AArch64CC::MI;
3099 break;
3100 case ISD::SETOLE:
3101 CondCode = AArch64CC::LS;
3102 break;
3103 case ISD::SETONE:
3104 CondCode = AArch64CC::MI;
3105 CondCode2 = AArch64CC::GT;
3106 break;
3107 case ISD::SETO:
3108 CondCode = AArch64CC::VC;
3109 break;
3110 case ISD::SETUO:
3111 CondCode = AArch64CC::VS;
3112 break;
3113 case ISD::SETUEQ:
3114 CondCode = AArch64CC::EQ;
3115 CondCode2 = AArch64CC::VS;
3116 break;
3117 case ISD::SETUGT:
3118 CondCode = AArch64CC::HI;
3119 break;
3120 case ISD::SETUGE:
3121 CondCode = AArch64CC::PL;
3122 break;
3123 case ISD::SETLT:
3124 case ISD::SETULT:
3125 CondCode = AArch64CC::LT;
3126 break;
3127 case ISD::SETLE:
3128 case ISD::SETULE:
3129 CondCode = AArch64CC::LE;
3130 break;
3131 case ISD::SETNE:
3132 case ISD::SETUNE:
3133 CondCode = AArch64CC::NE;
3134 break;
3135 }
3136}
3137
3138/// Convert a DAG fp condition code to an AArch64 CC.
3139/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3140/// should be AND'ed instead of OR'ed.
3141static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3142 AArch64CC::CondCode &CondCode,
3143 AArch64CC::CondCode &CondCode2) {
3144 CondCode2 = AArch64CC::AL;
3145 switch (CC) {
3146 default:
3147 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3148 assert(CondCode2 == AArch64CC::AL);
3149 break;
3150 case ISD::SETONE:
3151 // (a one b)
3152 // == ((a olt b) || (a ogt b))
3153 // == ((a ord b) && (a une b))
3154 CondCode = AArch64CC::VC;
3155 CondCode2 = AArch64CC::NE;
3156 break;
3157 case ISD::SETUEQ:
3158 // (a ueq b)
3159 // == ((a uno b) || (a oeq b))
3160 // == ((a ule b) && (a uge b))
3161 CondCode = AArch64CC::PL;
3162 CondCode2 = AArch64CC::LE;
3163 break;
3164 }
3165}
3166
3167/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3168/// CC usable with the vector instructions. Fewer operations are available
3169/// without a real NZCV register, so we have to use less efficient combinations
3170/// to get the same effect.
3171static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3172 AArch64CC::CondCode &CondCode,
3173 AArch64CC::CondCode &CondCode2,
3174 bool &Invert) {
3175 Invert = false;
3176 switch (CC) {
3177 default:
3178 // Mostly the scalar mappings work fine.
3179 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3180 break;
3181 case ISD::SETUO:
3182 Invert = true;
3183 [[fallthrough]];
3184 case ISD::SETO:
3185 CondCode = AArch64CC::MI;
3186 CondCode2 = AArch64CC::GE;
3187 break;
3188 case ISD::SETUEQ:
3189 case ISD::SETULT:
3190 case ISD::SETULE:
3191 case ISD::SETUGT:
3192 case ISD::SETUGE:
3193 // All of the compare-mask comparisons are ordered, but we can switch
3194 // between the two by a double inversion. E.g. ULE == !OGT.
3195 Invert = true;
3196 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3197 CondCode, CondCode2);
3198 break;
3199 }
3200}
3201
3202static bool isLegalArithImmed(uint64_t C) {
3203 // Matches AArch64DAGToDAGISel::SelectArithImmed().
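 // An arithmetic immediate is a 12-bit unsigned value, optionally shifted
 // left by 12 bits; e.g. 0xabc and 0xabc000 are legal, while 0xabcd is not.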
3204 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3205 LLVM_DEBUG(dbgs() << "Is imm " << C
3206 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3207 return IsLegal;
3208}
3209
3210// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3211// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3212// can be set differently by this operation. It comes down to whether
3213// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal
3214// then everything is fine; if not, the optimization is wrong. Thus general
3215// comparisons are only valid if op2 != 0.
3216//
3217// So, finally, the only LLVM-native comparisons that don't mention C and V
3218// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3219// the absence of information about op2.
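// For example, with op1 == 1 and op2 == 0: "subs 1, 0" sets C (no borrow),
// whereas "adds 1, 0" leaves C clear, so an unsigned comparison such as
// SETUGE would be answered differently by CMP and CMN.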
3220static bool isCMN(SDValue Op, ISD::CondCode CC) {
3221 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3222 (CC == ISD::SETEQ || CC == ISD::SETNE);
3223}
3224
3225static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3226 SelectionDAG &DAG, SDValue Chain,
3227 bool IsSignaling) {
3228 EVT VT = LHS.getValueType();
3229 assert(VT != MVT::f128);
3230
3231 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3232
3233 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3234 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3235 {Chain, LHS});
3236 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3237 {LHS.getValue(1), RHS});
3238 Chain = RHS.getValue(1);
3239 VT = MVT::f32;
3240 }
3241 unsigned Opcode =
3242 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3243 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3244}
3245
3246static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3247 const SDLoc &dl, SelectionDAG &DAG) {
3248 EVT VT = LHS.getValueType();
3249 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3250
3251 if (VT.isFloatingPoint()) {
3252 assert(VT != MVT::f128);
3253 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3254 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3255 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3256 VT = MVT::f32;
3257 }
3258 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3259 }
3260
3261 // The CMP instruction is just an alias for SUBS, and representing it as
3262 // SUBS means that it's possible to get CSE with subtract operations.
3263 // A later phase can perform the optimization of setting the destination
3264 // register to WZR/XZR if it ends up being unused.
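 // For example, the value needed for a separate (sub x, y) can be taken from
 // the same SUBS node that implements (cmp x, y), instead of computing the
 // subtraction twice.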
3265 unsigned Opcode = AArch64ISD::SUBS;
3266
3267 if (isCMN(RHS, CC)) {
3268 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3269 Opcode = AArch64ISD::ADDS;
3270 RHS = RHS.getOperand(1);
3271 } else if (isCMN(LHS, CC)) {
3272 // As we are looking for EQ/NE compares, the operands can be commuted; can
3273 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3274 Opcode = AArch64ISD::ADDS;
3275 LHS = LHS.getOperand(1);
3276 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3277 if (LHS.getOpcode() == ISD::AND) {
3278 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3279 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3280 // of the signed comparisons.
3281 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3282 DAG.getVTList(VT, MVT_CC),
3283 LHS.getOperand(0),
3284 LHS.getOperand(1));
3285 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3286 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3287 return ANDSNode.getValue(1);
3288 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3289 // Use result of ANDS
3290 return LHS.getValue(1);
3291 }
3292 }
3293
3294 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3295 .getValue(1);
3296}
3297
3298/// \defgroup AArch64CCMP CMP;CCMP matching
3299///
3300/// These functions deal with the formation of CMP;CCMP;... sequences.
3301/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3302/// a comparison. They set the NZCV flags to a predefined value if their
3303/// predicate is false. This allows expressing arbitrary conjunctions, for
3304/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3305/// can be expressed as:
3306/// cmp A
3307/// ccmp B, inv(CB), CA
3308/// check for CB flags
3309///
3310/// This naturally lets us implement chains of AND operations with SETCC
3311/// operands. And we can even implement some other situations by transforming
3312/// them:
3313/// - We can implement (NEG SETCC), i.e. negate a single comparison, by
3314/// negating the flags used in the CCMP/FCCMP operation.
3315/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3316/// by negating the flags we test for afterwards. i.e.
3317/// NEG (CMP CCMP CCMP ...) can be implemented.
3318/// - Note that we can only ever negate all previously processed results.
3319/// What we cannot implement by flipping the flags to test is a negation
3320/// of two sub-trees (because the negation affects all sub-trees emitted so
3321/// far, so the 2nd sub-tree we emit would also affect the first).
3322/// With those tools we can implement some OR operations:
3323/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3324/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3325/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3326/// elimination rules from earlier to implement the whole thing as a
3327/// CCMP/FCCMP chain.
3328///
3329/// As a complete example:
3330/// or (or (setCA (cmp A)) (setCB (cmp B)))
3331/// (and (setCC (cmp C)) (setCD (cmp D)))
3332/// can be reassociated to:
3333/// or (and (setCC (cmp C)) (setCD (cmp D)))
3334/// (or (setCA (cmp A)) (setCB (cmp B)))
3335/// can be transformed to: