1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, so data dependency becomes the
143// bottleneck after this transform on high-end CPUs. This maximum leaf-node
144// limit guards that the cmp+ccmp transform remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
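// AAPCS64 argument registers: the first eight GPR (X0-X7) and FP/SIMD (Q0-Q7)
// argument-passing registers.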
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
157
159
161
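// Maps a scalar element type to the SVE vector type that fills a full SVE
// register, e.g. i32 -> nxv4i32 and f16 -> nxv8f16.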
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
187static MVT getPackedSVEVectorVT(ElementCount EC) {
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
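// Returns the widened integer vector type a scalable i1 predicate is promoted
// to, e.g. nxv4i1 -> nxv4i32.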
202static MVT getPromotedVTForPredicate(MVT VT) {
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
271static bool isZeroingInactiveLanes(SDValue Op) {
272 switch (Op.getOpcode()) {
273 default:
274 // We guarantee i1 splat_vectors to zero the other lanes by
275 // implementing it with ptrue and possibly a punpklo for nxv1i1.
277 return true;
278 return false;
281 return true;
283 switch (Op.getConstantOperandVal(0)) {
284 default:
285 return false;
286 case Intrinsic::aarch64_sve_ptrue:
287 case Intrinsic::aarch64_sve_pnext:
288 case Intrinsic::aarch64_sve_cmpeq:
289 case Intrinsic::aarch64_sve_cmpne:
290 case Intrinsic::aarch64_sve_cmpge:
291 case Intrinsic::aarch64_sve_cmpgt:
292 case Intrinsic::aarch64_sve_cmphs:
293 case Intrinsic::aarch64_sve_cmphi:
294 case Intrinsic::aarch64_sve_cmpeq_wide:
295 case Intrinsic::aarch64_sve_cmpne_wide:
296 case Intrinsic::aarch64_sve_cmpge_wide:
297 case Intrinsic::aarch64_sve_cmpgt_wide:
298 case Intrinsic::aarch64_sve_cmplt_wide:
299 case Intrinsic::aarch64_sve_cmple_wide:
300 case Intrinsic::aarch64_sve_cmphs_wide:
301 case Intrinsic::aarch64_sve_cmphi_wide:
302 case Intrinsic::aarch64_sve_cmplo_wide:
303 case Intrinsic::aarch64_sve_cmpls_wide:
304 case Intrinsic::aarch64_sve_fcmpeq:
305 case Intrinsic::aarch64_sve_fcmpne:
306 case Intrinsic::aarch64_sve_fcmpge:
307 case Intrinsic::aarch64_sve_fcmpgt:
308 case Intrinsic::aarch64_sve_fcmpuo:
309 case Intrinsic::aarch64_sve_facgt:
310 case Intrinsic::aarch64_sve_facge:
311 case Intrinsic::aarch64_sve_whilege:
312 case Intrinsic::aarch64_sve_whilegt:
313 case Intrinsic::aarch64_sve_whilehi:
314 case Intrinsic::aarch64_sve_whilehs:
315 case Intrinsic::aarch64_sve_whilele:
316 case Intrinsic::aarch64_sve_whilelo:
317 case Intrinsic::aarch64_sve_whilels:
318 case Intrinsic::aarch64_sve_whilelt:
319 case Intrinsic::aarch64_sve_match:
320 case Intrinsic::aarch64_sve_nmatch:
321 case Intrinsic::aarch64_sve_whilege_x2:
322 case Intrinsic::aarch64_sve_whilegt_x2:
323 case Intrinsic::aarch64_sve_whilehi_x2:
324 case Intrinsic::aarch64_sve_whilehs_x2:
325 case Intrinsic::aarch64_sve_whilele_x2:
326 case Intrinsic::aarch64_sve_whilelo_x2:
327 case Intrinsic::aarch64_sve_whilels_x2:
328 case Intrinsic::aarch64_sve_whilelt_x2:
329 return true;
330 }
331 }
332}
333
334AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
335 const AArch64Subtarget &STI)
336 : TargetLowering(TM), Subtarget(&STI) {
337 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
338 // we have to make something up. Arbitrarily, choose ZeroOrOne.
340 // When comparing vectors, the result sets each element of the result
341 // vector to all-ones or all-zeros.
343
344 // Set up the register classes.
345 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
346 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
347
348 if (Subtarget->hasLS64()) {
349 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
350 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
352 }
353
354 if (Subtarget->hasFPARMv8()) {
355 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
356 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
357 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
358 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
359 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
360 }
361
362 if (Subtarget->hasNEON()) {
363 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
364 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
365 // Someone set us up the NEON.
366 addDRTypeForNEON(MVT::v2f32);
367 addDRTypeForNEON(MVT::v8i8);
368 addDRTypeForNEON(MVT::v4i16);
369 addDRTypeForNEON(MVT::v2i32);
370 addDRTypeForNEON(MVT::v1i64);
371 addDRTypeForNEON(MVT::v1f64);
372 addDRTypeForNEON(MVT::v4f16);
373 if (Subtarget->hasBF16())
374 addDRTypeForNEON(MVT::v4bf16);
375
376 addQRTypeForNEON(MVT::v4f32);
377 addQRTypeForNEON(MVT::v2f64);
378 addQRTypeForNEON(MVT::v16i8);
379 addQRTypeForNEON(MVT::v8i16);
380 addQRTypeForNEON(MVT::v4i32);
381 addQRTypeForNEON(MVT::v2i64);
382 addQRTypeForNEON(MVT::v8f16);
383 if (Subtarget->hasBF16())
384 addQRTypeForNEON(MVT::v8bf16);
385 }
386
387 if (Subtarget->hasSVEorSME()) {
388 // Add legal sve predicate types
389 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
390 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
391 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
392 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
393 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
394
395 // Add legal sve data types
396 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
397 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
400
401 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
403 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
404 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
407
408 if (Subtarget->hasBF16()) {
409 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
410 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
411 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
412 }
413
414 if (Subtarget->useSVEForFixedLengthVectors()) {
417 addRegisterClass(VT, &AArch64::ZPRRegClass);
418
421 addRegisterClass(VT, &AArch64::ZPRRegClass);
422 }
423 }
424
425 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
426 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
427 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
428 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
429
430 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
431 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
432 }
433
434 // Compute derived properties from the register classes
436
437 // Provide all sorts of operation actions
474
478
482
484
485 // Custom lowering hooks are needed for XOR
486 // to fold it into CSINC/CSINV.
489
490 // Virtually no operation on f128 is legal, but LLVM can't expand them when
491 // there's a valid register class, so we need custom operations in most cases.
515 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
516 // aren't handled.
517
518 // Lowering for many of the conversions is actually specified by the non-f128
519 // type. The LowerXXX function will be trivial when f128 isn't involved.
550
555
556 // Variable arguments.
561
562 // Variable-sized objects.
565
566 // Lowering Funnel Shifts to EXTR
571
572 if (Subtarget->isTargetWindows())
574 else
576
577 // Constant pool entries
579
580 // BlockAddress
582
583 // AArch64 lacks both left-rotate and popcount instructions.
589 }
590
591 // AArch64 doesn't have i32 MULH{S|U}.
594
595 // AArch64 doesn't have {U|S}MUL_LOHI.
600
601 if (Subtarget->hasCSSC()) {
605
607
611
614
619
624 } else {
628
631
634 }
635
641 }
648
649 // Custom lower Add/Sub/Mul with overflow.
662
671
680 if (Subtarget->hasFullFP16())
682 else
684
685 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
693 setOperationAction(Op, MVT::f16, Promote);
694 setOperationAction(Op, MVT::v4f16, Expand);
695 setOperationAction(Op, MVT::v8f16, Expand);
696 }
697
698 if (!Subtarget->hasFullFP16()) {
699 for (auto Op :
714 setOperationAction(Op, MVT::f16, Promote);
715
716 // Round-to-integer operations need custom lowering for fp16, as Promote
717 // doesn't work because the result type is integer.
721 setOperationAction(Op, MVT::f16, Custom);
722
723 // promote v4f16 to v4f32 when that is known to be safe.
724 setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
725 setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
726 setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
727 setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
728
729 setOperationAction(ISD::FABS, MVT::v4f16, Expand);
730 setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
733 setOperationAction(ISD::FMA, MVT::v4f16, Expand);
745
746 setOperationAction(ISD::FABS, MVT::v8f16, Expand);
747 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
750 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
752 setOperationAction(ISD::FMA, MVT::v8f16, Expand);
753 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
755 setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
760 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
767 }
768
769 // AArch64 has implementations of a lot of rounding-like FP operations.
770 for (auto Op :
781 for (MVT Ty : {MVT::f32, MVT::f64})
783 if (Subtarget->hasFullFP16())
784 setOperationAction(Op, MVT::f16, Legal);
785 }
786
787 // Basic strict FP operations are legal
790 for (MVT Ty : {MVT::f32, MVT::f64})
792 if (Subtarget->hasFullFP16())
793 setOperationAction(Op, MVT::f16, Legal);
794 }
795
796 // Strict conversion to a larger type is legal
797 for (auto VT : {MVT::f32, MVT::f64})
799
801
804
806 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
809 } else {
812 }
815
816 // Generate outline atomics library calls only if LSE was not specified for
817 // subtarget
818 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
844#define LCALLNAMES(A, B, N) \
845 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
846 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
847 setLibcallName(A##N##_REL, #B #N "_rel"); \
848 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
849#define LCALLNAME4(A, B) \
850 LCALLNAMES(A, B, 1) \
851 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
852#define LCALLNAME5(A, B) \
853 LCALLNAMES(A, B, 1) \
854 LCALLNAMES(A, B, 2) \
855 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
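// For example, LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
// registers the outline-atomic helpers __aarch64_ldadd{1,2,4,8}_{relax,acq,rel,acq_rel}.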
856 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
857 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
858 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
859 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
860 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
861 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
862#undef LCALLNAMES
863#undef LCALLNAME4
864#undef LCALLNAME5
865 }
866
867 if (Subtarget->hasLSE128()) {
868 // Custom lowering because i128 is not legal. Must be replaced by 2x64
869 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
873 }
874
875 // 128-bit loads and stores can be done without expanding
878
879 // Aligned 128-bit loads and stores are single-copy atomic according to the
880 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
881 if (Subtarget->hasLSE2()) {
884 }
885
886 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
887 // custom lowering, as there are no un-paired non-temporal stores and
888 // legalization will break up 256 bit inputs.
890 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
891 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
896
897 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
898 // custom lowering, as there are no un-paired non-temporal loads and
899 // legalization will break up 256 bit inputs.
900 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
901 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
902 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
903 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
904 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
905 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
906 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
907
908 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
910
911 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
912 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
913 // Issue __sincos_stret if available.
916 } else {
919 }
920
921 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
922 // MSVCRT doesn't have powi; fall back to pow
923 setLibcallName(RTLIB::POWI_F32, nullptr);
924 setLibcallName(RTLIB::POWI_F64, nullptr);
925 }
926
927 // Make floating-point constants legal for the large code model, so they don't
928 // become loads from the constant pool.
929 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
932 }
933
934 // AArch64 does not have floating-point extending loads, i1 sign-extending
935 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
936 for (MVT VT : MVT::fp_valuetypes()) {
937 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
938 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
939 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
940 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
941 }
942 for (MVT VT : MVT::integer_valuetypes())
944
945 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
946 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
947 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
948 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
949 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
950 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
951 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
952
956
957 // Indexed loads and stores are supported.
958 for (unsigned im = (unsigned)ISD::PRE_INC;
960 setIndexedLoadAction(im, MVT::i8, Legal);
961 setIndexedLoadAction(im, MVT::i16, Legal);
962 setIndexedLoadAction(im, MVT::i32, Legal);
963 setIndexedLoadAction(im, MVT::i64, Legal);
964 setIndexedLoadAction(im, MVT::f64, Legal);
965 setIndexedLoadAction(im, MVT::f32, Legal);
966 setIndexedLoadAction(im, MVT::f16, Legal);
967 setIndexedLoadAction(im, MVT::bf16, Legal);
968 setIndexedStoreAction(im, MVT::i8, Legal);
969 setIndexedStoreAction(im, MVT::i16, Legal);
970 setIndexedStoreAction(im, MVT::i32, Legal);
971 setIndexedStoreAction(im, MVT::i64, Legal);
972 setIndexedStoreAction(im, MVT::f64, Legal);
973 setIndexedStoreAction(im, MVT::f32, Legal);
974 setIndexedStoreAction(im, MVT::f16, Legal);
975 setIndexedStoreAction(im, MVT::bf16, Legal);
976 }
977
978 // Trap.
979 setOperationAction(ISD::TRAP, MVT::Other, Legal);
982
983 // We combine OR nodes for bitfield operations.
985 // Try to create BICs for vector ANDs.
987
988 // Vector add and sub nodes may conceal a high-half opportunity.
989 // Also, try to fold ADD into CSINC/CSINV..
992
995
996 // Try and combine setcc with csel
998
1000
1007
1009
1011
1013
1017
1019
1021
1023
1025
1029
1031
1032 // In case of strict alignment, avoid an excessive number of byte wide stores.
1035 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1036
1040 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1041
1044
1047 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1048
1050
1052
1053 EnableExtLdPromotion = true;
1054
1055 // Set required alignment.
1057 // Set preferred alignments.
1058
1059 // Don't align loops on Windows. The SEH unwind info generation needs to
1060 // know the exact length of functions before the alignments have been
1061 // expanded.
1062 if (!Subtarget->isTargetWindows())
1066
1067 // Only change the limit for entries in a jump table if specified by
1068 // the subtarget, but not at the command line.
1069 unsigned MaxJT = STI.getMaximumJumpTableSize();
1070 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1072
1074
1076
1078
1079 if (Subtarget->hasNEON()) {
1080 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1081 // silliness like this:
1082 for (auto Op :
1097 setOperationAction(Op, MVT::v1f64, Expand);
1098
1099 for (auto Op :
1104 setOperationAction(Op, MVT::v1i64, Expand);
1105
1106 // AArch64 doesn't have direct vector->f32 conversion instructions for
1107 // elements smaller than i32, so promote the input to i32 first.
1108 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1109 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1110
1111 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1112 // nor a direct i32 -> f16 vector conversion. Set these to Custom so the
1113 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1116 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1118
1119 if (Subtarget->hasFullFP16()) {
1122
1131 } else {
1132 // When AArch64 doesn't have full fp16 support, promote the input
1133 // to i32 first.
1134 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1135 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1136 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1137 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1138 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1139 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1140 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1141 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1142 }
1143
1144 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1145 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1152 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1157 }
1158
1159 // Custom handling for some quad-vector types to detect MULL.
1160 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1161 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1162 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1163 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1164 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1165 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1166
1167 // Saturates
1168 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1169 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174 }
1175
1176 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1177 MVT::v4i32}) {
1184 }
1185
1186 // Vector reductions
1187 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1188 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1189 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1194
1196 }
1197 }
1198 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1199 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1208 }
1213
1215 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1216 // Likewise, narrowing and extending vector loads/stores aren't handled
1217 // directly.
1220
1221 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1224 } else {
1227 }
1230
1233
1234 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1235 setTruncStoreAction(VT, InnerVT, Expand);
1236 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1237 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1238 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1239 }
1240 }
1241
1242 // AArch64 has implementations of a lot of rounding-like FP operations.
1243 for (auto Op :
1248 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1250 if (Subtarget->hasFullFP16())
1251 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1253 }
1254
1255 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1256
1261
1265
1266 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1267 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1268 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1269 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1270 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1271 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1272
1273 // ADDP custom lowering
1274 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1276 // FADDP custom lowering
1277 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1279 }
1280
1281 if (Subtarget->hasSME()) {
1283 }
1284
1285 // FIXME: Move lowering for more nodes here if those are common between
1286 // SVE and SME.
1287 if (Subtarget->hasSVEorSME()) {
1288 for (auto VT :
1289 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1294 }
1295 }
1296
1297 if (Subtarget->hasSVEorSME()) {
1298 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1341
1347
1356
1361
1362 if (!Subtarget->isLittleEndian())
1364 }
1365
1366 // Illegal unpacked integer vector types.
1367 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1370 }
1371
1372 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1373 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1374 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1376
1377 for (auto VT :
1378 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1379 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1381
1382 for (auto VT :
1383 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1391
1395
1396 // There are no legal MVT::nxv16f## based types.
1397 if (VT != MVT::nxv16i1) {
1400 }
1401 }
1402
1403 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1404 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1405 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1406 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1411 }
1412
1413 // Firstly, exclude all scalable vector extending loads/truncating stores,
1414 // covering both integer and floating-point scalable vectors.
1416 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1417 setTruncStoreAction(VT, InnerVT, Expand);
1418 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1419 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1420 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1421 }
1422 }
1423
1424 // Then, selectively enable those which we directly support.
1425 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1426 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1427 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1428 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1429 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1430 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1431 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1432 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1433 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1434 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1435 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1436 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1437 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1438 }
1439
1440 // SVE supports truncating stores of 64 and 128-bit vectors
1441 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1442 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1443 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1444 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1445 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1446
1447 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1448 MVT::nxv4f32, MVT::nxv2f64}) {
1484 if (Subtarget->isSVEAvailable())
1489
1503
1515
1516 if (!Subtarget->isLittleEndian())
1518 }
1519
1520 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1527
1528 if (!Subtarget->isLittleEndian())
1530 }
1531
1534
1535 // NEON doesn't support integer divides, but SVE does
1536 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1537 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1540 }
1541
1542 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1543 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1544 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1545
1546 if (Subtarget->isSVEAvailable()) {
1547 // NEON doesn't support across-vector reductions, but SVE does.
1548 for (auto VT :
1549 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1551 }
1552
1553 if (!Subtarget->isNeonAvailable()) {
1554 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom);
1555 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom);
1556 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom);
1557 setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom);
1558 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom);
1559 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom);
1560 setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom);
1561 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom);
1562 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
1563 for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1564 MVT::v4i32, MVT::v1i64, MVT::v2i64})
1565 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1566
1567 for (MVT VT :
1568 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1569 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1570 }
1571
1572 // NOTE: Currently this has to happen after computeRegisterProperties rather
1573 // than the preferred option of combining it with the addRegisterClass call.
1574 if (Subtarget->useSVEForFixedLengthVectors()) {
1577 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1580 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1581
1582 // 64-bit results can come from an input wider than a NEON vector.
1583 for (auto VT : {MVT::v8i8, MVT::v4i16})
1586
1587 // 128-bit results imply an input wider than a NEON vector.
1588 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1590 for (auto VT : {MVT::v8f16, MVT::v4f32})
1592
1593 // These operations are not supported on NEON but SVE can do them.
1595 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1596 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1597 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1599 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1600 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1601 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1602 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1603 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1604 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1605 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1606 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1608 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1614
1615 // Int operations with no NEON support.
1616 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1617 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1625 }
1626
1627
1628 // Use SVE for vectors with more than 2 elements.
1629 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1631 }
1632
1633 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1634 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1635 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1636 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1637
1639 }
1640
1641 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1642 // Only required for llvm.aarch64.mops.memset.tag
1644 }
1645
1647
1648 if (Subtarget->hasSVE()) {
1652 }
1653
1654 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1655
1656 IsStrictFPEnabled = true;
1657}
1658
1659void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1660 assert(VT.isVector() && "VT should be a vector type");
1661
1662 if (VT.isFloatingPoint()) {
1664 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1665 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1666 }
1667
1668 // Mark vector float intrinsics as expand.
1669 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1679 }
1680
1681 // But we do support custom-lowering for FCOPYSIGN.
1682 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1683 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1685
1698
1702 for (MVT InnerVT : MVT::all_valuetypes())
1703 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1704
1705 // CNT supports only B element sizes, then use UADDLP to widen.
1706 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1708
1714
1715 for (unsigned Opcode :
1719
1720 if (!VT.isFloatingPoint())
1722
1723 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1724 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1725 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1727
1728 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1729 // NEON types.
1730 if (VT.isFloatingPoint() &&
1731 VT.getVectorElementType() != MVT::bf16 &&
1732 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1733 for (unsigned Opcode :
1740
1741 // Strict fp extend and trunc are legal
1742 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1744 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1746
1747 // FIXME: We could potentially make use of the vector comparison instructions
1748 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1749 // complications:
1750 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1751 // so we would need to expand when the condition code doesn't match the
1752 // kind of comparison.
1753 // * Some kinds of comparison require more than one FCMXY instruction so
1754 // would need to be expanded instead.
1755 // * The lowering of the non-strict versions involves target-specific ISD
1756 // nodes so we would likely need to add strict versions of all of them and
1757 // handle them appropriately.
1760
1761 if (Subtarget->isLittleEndian()) {
1762 for (unsigned im = (unsigned)ISD::PRE_INC;
1766 }
1767 }
1768
1769 if (Subtarget->hasD128()) {
1772 }
1773}
1774
1775bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1776 EVT OpVT) const {
1777 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1778 if (!Subtarget->hasSVE())
1779 return true;
1780
1781 // We can only support legal predicate result types. We can use the SVE
1782 // whilelo instruction for generating fixed-width predicates too.
1783 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1784 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1785 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1786 return true;
1787
1788 // The whilelo instruction only works with i32 or i64 scalar inputs.
1789 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1790 return true;
1791
1792 return false;
1793}
1794
1796 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1797}
1798
1799void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
1800 bool StreamingSVE) {
1801 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1802
1803 // By default everything must be expanded.
1804 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1806
1807 if (VT.isFloatingPoint()) {
1817 }
1818
1819 // Mark integer truncating stores/extending loads as having custom lowering
1820 if (VT.isInteger()) {
1821 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1822 while (InnerVT != VT) {
1823 setTruncStoreAction(VT, InnerVT, Custom);
1824 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1825 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1826 InnerVT = InnerVT.changeVectorElementType(
1827 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1828 }
1829 }
1830
1831 // Mark floating-point truncating stores/extending loads as having custom
1832 // lowering
1833 if (VT.isFloatingPoint()) {
1834 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1835 while (InnerVT != VT) {
1836 setTruncStoreAction(VT, InnerVT, Custom);
1837 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1838 InnerVT = InnerVT.changeVectorElementType(
1840 }
1841 }
1842
1843 // Lower fixed length vector operations to scalable equivalents.
1848 setOperationAction(ISD::BITCAST, VT, StreamingSVE ? Legal : Custom);
1883 setOperationAction(ISD::LOAD, VT, StreamingSVE ? Legal : Custom);
1884 setOperationAction(ISD::MGATHER, VT, StreamingSVE ? Expand : Custom);
1886 setOperationAction(ISD::MSCATTER, VT, StreamingSVE ? Expand : Custom);
1905 setOperationAction(ISD::STORE, VT, StreamingSVE ? Legal : Custom);
1921 StreamingSVE ? Expand : Custom);
1932}
1933
1934void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1935 addRegisterClass(VT, &AArch64::FPR64RegClass);
1936 addTypeForNEON(VT);
1937}
1938
1939void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1940 addRegisterClass(VT, &AArch64::FPR128RegClass);
1941 addTypeForNEON(VT);
1942}
1943
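// Comparisons yield an i32 for scalars and, for scalable vectors, a predicate
// vector with i1 elements of the same element count.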
1944EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1945 LLVMContext &C, EVT VT) const {
1946 if (!VT.isVector())
1947 return MVT::i32;
1948 if (VT.isScalableVector())
1949 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1950 return VT.changeVectorElementTypeToInteger();
1951}
1952
1953// isIntImmediate - This method tests to see if the node is a constant
1954// operand. If so, Imm will receive the value.
1955static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
1956 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
1957 Imm = C->getZExtValue();
1958 return true;
1959 }
1960 return false;
1961}
1962
1963// isOpcWithIntImmediate - This method tests to see if the node is a specific
1964// opcode and that it has an immediate integer right operand.
1965// If so, Imm will receive the value.
1966static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
1967 uint64_t &Imm) {
1968 return N->getOpcode() == Opc &&
1969 isIntImmediate(N->getOperand(1).getNode(), Imm);
1970}
1971
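// Try to re-encode the immediate of a logical node (AND/ORR/EOR) so that,
// given the bits actually demanded, it becomes an encodable AArch64 logical
// immediate or folds to all-zeros/all-ones.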
1972static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1973 const APInt &Demanded,
1975 unsigned NewOpc) {
1976 uint64_t OldImm = Imm, NewImm, Enc;
1977 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1978
1979 // Return if the immediate is already all zeros, all ones, a bimm32 or a
1980 // bimm64.
1981 if (Imm == 0 || Imm == Mask ||
1983 return false;
1984
1985 unsigned EltSize = Size;
1986 uint64_t DemandedBits = Demanded.getZExtValue();
1987
1988 // Clear bits that are not demanded.
1989 Imm &= DemandedBits;
1990
1991 while (true) {
1992 // The goal here is to set the non-demanded bits in a way that minimizes
1993 // the number of switching between 0 and 1. In order to achieve this goal,
1994 // we set the non-demanded bits to the value of the preceding demanded bits.
1995 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1996 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1997 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1998 // The final result is 0b11000011.
1999 uint64_t NonDemandedBits = ~DemandedBits;
2000 uint64_t InvertedImm = ~Imm & DemandedBits;
2001 uint64_t RotatedImm =
2002 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2003 NonDemandedBits;
2004 uint64_t Sum = RotatedImm + NonDemandedBits;
2005 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2006 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2007 NewImm = (Imm | Ones) & Mask;
2008
2009 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2010 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2011 // we halve the element size and continue the search.
2012 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2013 break;
2014
2015 // We cannot shrink the element size any further if it is 2-bits.
2016 if (EltSize == 2)
2017 return false;
2018
2019 EltSize /= 2;
2020 Mask >>= EltSize;
2021 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2022
2023 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2024 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2025 return false;
2026
2027 // Merge the upper and lower halves of Imm and DemandedBits.
2028 Imm |= Hi;
2029 DemandedBits |= DemandedBitsHi;
2030 }
2031
2032 ++NumOptimizedImms;
2033
2034 // Replicate the element across the register width.
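  // For example, with Size == 64 and a final EltSize of 16, the surviving
  // 16-bit pattern is copied into all four 16-bit chunks of the immediate.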
2035 while (EltSize < Size) {
2036 NewImm |= NewImm << EltSize;
2037 EltSize *= 2;
2038 }
2039
2040 (void)OldImm;
2041 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2042 "demanded bits should never be altered");
2043 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2044
2045 // Create the new constant immediate node.
2046 EVT VT = Op.getValueType();
2047 SDLoc DL(Op);
2048 SDValue New;
2049
2050 // If the new constant immediate is all-zeros or all-ones, let the target
2051 // independent DAG combine optimize this node.
2052 if (NewImm == 0 || NewImm == OrigMask) {
2053 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2054 TLO.DAG.getConstant(NewImm, DL, VT));
2055 // Otherwise, create a machine node so that target independent DAG combine
2056 // doesn't undo this optimization.
2057 } else {
2059 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2060 New = SDValue(
2061 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2062 }
2063
2064 return TLO.CombineTo(Op, New);
2065}
2066
2067bool AArch64TargetLowering::targetShrinkDemandedConstant(
2068 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2069 TargetLoweringOpt &TLO) const {
2070 // Delay this optimization to as late as possible.
2071 if (!TLO.LegalOps)
2072 return false;
2073
2075 return false;
2076
2077 EVT VT = Op.getValueType();
2078 if (VT.isVector())
2079 return false;
2080
2081 unsigned Size = VT.getSizeInBits();
2082 assert((Size == 32 || Size == 64) &&
2083 "i32 or i64 is expected after legalization.");
2084
2085 // Exit early if we demand all bits.
2086 if (DemandedBits.popcount() == Size)
2087 return false;
2088
2089 unsigned NewOpc;
2090 switch (Op.getOpcode()) {
2091 default:
2092 return false;
2093 case ISD::AND:
2094 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2095 break;
2096 case ISD::OR:
2097 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2098 break;
2099 case ISD::XOR:
2100 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2101 break;
2102 }
2103 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2104 if (!C)
2105 return false;
2106 uint64_t Imm = C->getZExtValue();
2107 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2108}
2109
2110/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2111/// Mask are known to be either zero or one and return them in Known.
2112void AArch64TargetLowering::computeKnownBitsForTargetNode(
2113 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2114 const SelectionDAG &DAG, unsigned Depth) const {
2115 switch (Op.getOpcode()) {
2116 default:
2117 break;
2118 case AArch64ISD::DUP: {
2119 SDValue SrcOp = Op.getOperand(0);
2120 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2121 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2122 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2123 "Expected DUP implicit truncation");
2124 Known = Known.trunc(Op.getScalarValueSizeInBits());
2125 }
2126 break;
2127 }
2128 case AArch64ISD::CSEL: {
2129 KnownBits Known2;
2130 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2131 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2132 Known = Known.intersectWith(Known2);
2133 break;
2134 }
2135 case AArch64ISD::BICi: {
2136 // Compute the bit cleared value.
2137 uint64_t Mask =
2138 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2139 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2140 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2141 break;
2142 }
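  // For the vector shift-by-immediate nodes below, the known bits follow from
  // the shifted operand and the shift amount.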
2143 case AArch64ISD::VLSHR: {
2144 KnownBits Known2;
2145 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2146 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2147 Known = KnownBits::lshr(Known, Known2);
2148 break;
2149 }
2150 case AArch64ISD::VASHR: {
2151 KnownBits Known2;
2152 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2153 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2154 Known = KnownBits::ashr(Known, Known2);
2155 break;
2156 }
2157 case AArch64ISD::VSHL: {
2158 KnownBits Known2;
2159 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2160 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2161 Known = KnownBits::shl(Known, Known2);
2162 break;
2163 }
2164 case AArch64ISD::MOVI: {
2165 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(0));
2166 Known =
2168 break;
2169 }
2171 case AArch64ISD::ADDlow: {
2172 if (!Subtarget->isTargetILP32())
2173 break;
2174 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2175 Known.Zero = APInt::getHighBitsSet(64, 32);
2176 break;
2177 }
2179 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2180 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2181 break;
2182 }
2184 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
2185 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
2186 switch (IntID) {
2187 default: return;
2188 case Intrinsic::aarch64_ldaxr:
2189 case Intrinsic::aarch64_ldxr: {
2190 unsigned BitWidth = Known.getBitWidth();
2191 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2192 unsigned MemBits = VT.getScalarSizeInBits();
2193 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2194 return;
2195 }
2196 }
2197 break;
2198 }
2200 case ISD::INTRINSIC_VOID: {
2201 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2202 switch (IntNo) {
2203 default:
2204 break;
2205 case Intrinsic::aarch64_neon_uaddlv: {
2206 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2207 unsigned BitWidth = Known.getBitWidth();
2208 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2209 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2210 assert(BitWidth >= Bound && "Unexpected width!");
2212 Known.Zero |= Mask;
2213 }
2214 break;
2215 }
2216 case Intrinsic::aarch64_neon_umaxv:
2217 case Intrinsic::aarch64_neon_uminv: {
2218 // Figure out the datatype of the vector operand. The UMINV instruction
2219 // will zero extend the result, so we can mark as known zero all the
2221 // bits larger than the element datatype. 32-bit or larger doesn't need
2221 // this as those are legal types and will be handled by isel directly.
2222 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2223 unsigned BitWidth = Known.getBitWidth();
2224 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2225 assert(BitWidth >= 8 && "Unexpected width!");
2227 Known.Zero |= Mask;
2228 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2229 assert(BitWidth >= 16 && "Unexpected width!");
2231 Known.Zero |= Mask;
2232 }
2233 break;
2234 } break;
2235 }
2236 }
2237 }
2238}
2239
2240unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2241 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2242 unsigned Depth) const {
2243 EVT VT = Op.getValueType();
2244 unsigned VTBits = VT.getScalarSizeInBits();
2245 unsigned Opcode = Op.getOpcode();
2246 switch (Opcode) {
2247 case AArch64ISD::CMEQ:
2248 case AArch64ISD::CMGE:
2249 case AArch64ISD::CMGT:
2250 case AArch64ISD::CMHI:
2251 case AArch64ISD::CMHS:
2252 case AArch64ISD::FCMEQ:
2253 case AArch64ISD::FCMGE:
2254 case AArch64ISD::FCMGT:
2255 case AArch64ISD::CMEQz:
2256 case AArch64ISD::CMGEz:
2257 case AArch64ISD::CMGTz:
2258 case AArch64ISD::CMLEz:
2259 case AArch64ISD::CMLTz:
2260 case AArch64ISD::FCMEQz:
2261 case AArch64ISD::FCMGEz:
2262 case AArch64ISD::FCMGTz:
2263 case AArch64ISD::FCMLEz:
2264 case AArch64ISD::FCMLTz:
2265 // Compares return either 0 or all-ones
2266 return VTBits;
2267 }
2268
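  // Conservative default: only a single sign bit is guaranteed.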
2269 return 1;
2270}
2271
2272MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2273 EVT) const {
2274 return MVT::i64;
2275}
2276
2277bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2278 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2279 unsigned *Fast) const {
2280 if (Subtarget->requiresStrictAlign())
2281 return false;
2282
2283 if (Fast) {
2284 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2285 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2286 // See comments in performSTORECombine() for more details about
2287 // these conditions.
2288
2289 // Code that uses clang vector extensions can mark that it
2290 // wants unaligned accesses to be treated as fast by
2291 // underspecifying alignment to be 1 or 2.
2292 Alignment <= 2 ||
2293
2294 // Disregard v2i64. Memcpy lowering produces those and splitting
2295 // them regresses performance on micro-benchmarks and olden/bh.
2296 VT == MVT::v2i64;
2297 }
2298 return true;
2299}
2300
2301// Same as above but handling LLTs instead.
2302bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2303 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2304 unsigned *Fast) const {
2305 if (Subtarget->requiresStrictAlign())
2306 return false;
2307
2308 if (Fast) {
2309 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2310 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2311 Ty.getSizeInBytes() != 16 ||
2312 // See comments in performSTORECombine() for more details about
2313 // these conditions.
2314
2315 // Code that uses clang vector extensions can mark that it
2316 // wants unaligned accesses to be treated as fast by
2317 // underspecifying alignment to be 1 or 2.
2318 Alignment <= 2 ||
2319
2320 // Disregard v2i64. Memcpy lowering produces those and splitting
2321 // them regresses performance on micro-benchmarks and olden/bh.
2322 Ty == LLT::fixed_vector(2, 64);
2323 }
2324 return true;
2325}
2326
2327FastISel *
2328AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2329 const TargetLibraryInfo *libInfo) const {
2330 return AArch64::createFastISel(funcInfo, libInfo);
2331}
2332
2333const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2334#define MAKE_CASE(V) \
2335 case V: \
2336 return #V;
2337 switch ((AArch64ISD::NodeType)Opcode) {
2338 case AArch64ISD::FIRST_NUMBER:
2339 break;
2649 }
2650#undef MAKE_CASE
2651 return nullptr;
2652}
2653
2654MachineBasicBlock *
2655AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2656 MachineBasicBlock *MBB) const {
2657 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2658 // phi node:
2659
2660 // OrigBB:
2661 // [... previous instrs leading to comparison ...]
2662 // b.ne TrueBB
2663 // b EndBB
2664 // TrueBB:
2665 // ; Fallthrough
2666 // EndBB:
2667 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2668
2669 MachineFunction *MF = MBB->getParent();
2670 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2671 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2672 DebugLoc DL = MI.getDebugLoc();
2673 MachineFunction::iterator It = ++MBB->getIterator();
2674
2675 Register DestReg = MI.getOperand(0).getReg();
2676 Register IfTrueReg = MI.getOperand(1).getReg();
2677 Register IfFalseReg = MI.getOperand(2).getReg();
2678 unsigned CondCode = MI.getOperand(3).getImm();
2679 bool NZCVKilled = MI.getOperand(4).isKill();
2680
2681 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2682 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2683 MF->insert(It, TrueBB);
2684 MF->insert(It, EndBB);
2685
2686 // Transfer rest of current basic-block to EndBB
2687 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2688 MBB->end());
2689 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2690
2691 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2692 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2693 MBB->addSuccessor(TrueBB);
2694 MBB->addSuccessor(EndBB);
2695
2696 // TrueBB falls through to the end.
2697 TrueBB->addSuccessor(EndBB);
2698
2699 if (!NZCVKilled) {
2700 TrueBB->addLiveIn(AArch64::NZCV);
2701 EndBB->addLiveIn(AArch64::NZCV);
2702 }
2703
2704 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2705 .addReg(IfTrueReg)
2706 .addMBB(TrueBB)
2707 .addReg(IfFalseReg)
2708 .addMBB(MBB);
2709
2710 MI.eraseFromParent();
2711 return EndBB;
2712}
2713
2714MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2715 MachineInstr &MI, MachineBasicBlock *BB) const {
2716 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2717 BB->getParent()->getFunction().getPersonalityFn())) &&
2718 "SEH does not use catchret!");
2719 return BB;
2720}
2721
2723AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2725 MachineBasicBlock *BB) const {
2726 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2727 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2728
2729 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2730 MIB.add(MI.getOperand(1)); // slice index register
2731 MIB.add(MI.getOperand(2)); // slice index offset
2732 MIB.add(MI.getOperand(3)); // pg
2733 MIB.add(MI.getOperand(4)); // base
2734 MIB.add(MI.getOperand(5)); // offset
2735
2736 MI.eraseFromParent(); // The pseudo is gone now.
2737 return BB;
2738}
2739
2740MachineBasicBlock *
2741AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2742 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2744 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2745
2746 MIB.addReg(AArch64::ZA, RegState::Define);
2747 MIB.add(MI.getOperand(0)); // Vector select register
2748 MIB.add(MI.getOperand(1)); // Vector select offset
2749 MIB.add(MI.getOperand(2)); // Base
2750 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2751
2752 MI.eraseFromParent(); // The pseudo is gone now.
2753 return BB;
2754}
2755
2757AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2759 MachineBasicBlock *BB, bool HasTile) const {
2760 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2761 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2762 unsigned StartIdx = 0;
2763
2764 if (HasTile) {
2765 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2766 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2767 StartIdx = 1;
2768 } else
2769 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2770
2771 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2772 MIB.add(MI.getOperand(I));
2773
2774 MI.eraseFromParent(); // The pseudo is gone now.
2775 return BB;
2776}
2777
2778MachineBasicBlock *
2779AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2780 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2782 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2783 MIB.add(MI.getOperand(0)); // Mask
2784
2785 unsigned Mask = MI.getOperand(0).getImm();
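  // Each of the low eight mask bits selects one 64-bit ZA tile (ZAD0..ZAD7)
  // that this ZERO instruction implicitly defines.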
2786 for (unsigned I = 0; I < 8; I++) {
2787 if (Mask & (1 << I))
2788 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2789 }
2790
2791 MI.eraseFromParent(); // The pseudo is gone now.
2792 return BB;
2793}
2794
2795MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2796 MachineInstr &MI, MachineBasicBlock *BB) const {
2797
2798 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2799 if (SMEOrigInstr != -1) {
2800 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2801 uint64_t SMEMatrixType =
2802 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2803 switch (SMEMatrixType) {
2805 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2807 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2809 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2811 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2813 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2815 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2816 }
2817 }
2818
2819 switch (MI.getOpcode()) {
2820 default:
2821#ifndef NDEBUG
2822 MI.dump();
2823#endif
2824 llvm_unreachable("Unexpected instruction for custom inserter!");
2825
2826 case AArch64::F128CSEL:
2827 return EmitF128CSEL(MI, BB);
2828 case TargetOpcode::STATEPOINT:
2829 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2830 // while the bl call instruction (to which the statepoint is lowered at the
2831 // end) has an implicit def. This def is early-clobber as it will be set at
2832 // the moment of the call and earlier than any use is read.
2833 // Add this implicit dead def here as a workaround.
2834 MI.addOperand(*MI.getMF(),
2835 MachineOperand::CreateReg(
2836 AArch64::LR, /*isDef*/ true,
2837 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2838 /*isUndef*/ false, /*isEarlyClobber*/ true));
2839 [[fallthrough]];
2840 case TargetOpcode::STACKMAP:
2841 case TargetOpcode::PATCHPOINT:
2842 return emitPatchPoint(MI, BB);
2843
2844 case TargetOpcode::PATCHABLE_EVENT_CALL:
2845 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2846 return BB;
2847
2848 case AArch64::CATCHRET:
2849 return EmitLoweredCatchRet(MI, BB);
2850 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2851 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2852 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2853 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2854 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2855 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2856 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2857 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2858 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2859 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2860 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2861 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2862 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2863 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2864 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2865 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2866 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2867 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2868 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2869 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2870 case AArch64::LDR_ZA_PSEUDO:
2871 return EmitFill(MI, BB);
2872 case AArch64::ZERO_M_PSEUDO:
2873 return EmitZero(MI, BB);
2874 }
2875}
2876
2877//===----------------------------------------------------------------------===//
2878// AArch64 Lowering private implementation.
2879//===----------------------------------------------------------------------===//
2880
2881//===----------------------------------------------------------------------===//
2882// Lowering Code
2883//===----------------------------------------------------------------------===//
2884
2885// Forward declarations of SVE fixed length lowering helpers
2889static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2890 SelectionDAG &DAG);
2891static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
2892 EVT VT);
2893
2894/// isZerosVector - Check whether SDNode N is a zero-filled vector.
2895static bool isZerosVector(const SDNode *N) {
2896 // Look through a bit convert.
2897 while (N->getOpcode() == ISD::BITCAST)
2898 N = N->getOperand(0).getNode();
2899
2900 if (ISD::isConstantSplatVectorAllZeros(N))
2901 return true;
2902
2903 if (N->getOpcode() != AArch64ISD::DUP)
2904 return false;
2905
2906 auto Opnd0 = N->getOperand(0);
2907 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
2908}
2909
2910/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2911/// CC
2912static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2913 switch (CC) {
2914 default:
2915 llvm_unreachable("Unknown condition code!");
2916 case ISD::SETNE:
2917 return AArch64CC::NE;
2918 case ISD::SETEQ:
2919 return AArch64CC::EQ;
2920 case ISD::SETGT:
2921 return AArch64CC::GT;
2922 case ISD::SETGE:
2923 return AArch64CC::GE;
2924 case ISD::SETLT:
2925 return AArch64CC::LT;
2926 case ISD::SETLE:
2927 return AArch64CC::LE;
2928 case ISD::SETUGT:
2929 return AArch64CC::HI;
2930 case ISD::SETUGE:
2931 return AArch64CC::HS;
2932 case ISD::SETULT:
2933 return AArch64CC::LO;
2934 case ISD::SETULE:
2935 return AArch64CC::LS;
2936 }
2937}
2938
2939/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2940static void changeFPCCToAArch64CC(ISD::CondCode CC,
2941 AArch64CC::CondCode &CondCode,
2942 AArch64CC::CondCode &CondCode2) {
2943 CondCode2 = AArch64CC::AL;
2944 switch (CC) {
2945 default:
2946 llvm_unreachable("Unknown FP condition!");
2947 case ISD::SETEQ:
2948 case ISD::SETOEQ:
2949 CondCode = AArch64CC::EQ;
2950 break;
2951 case ISD::SETGT:
2952 case ISD::SETOGT:
2953 CondCode = AArch64CC::GT;
2954 break;
2955 case ISD::SETGE:
2956 case ISD::SETOGE:
2957 CondCode = AArch64CC::GE;
2958 break;
2959 case ISD::SETOLT:
2960 CondCode = AArch64CC::MI;
2961 break;
2962 case ISD::SETOLE:
2963 CondCode = AArch64CC::LS;
2964 break;
2965 case ISD::SETONE:
2966 CondCode = AArch64CC::MI;
2967 CondCode2 = AArch64CC::GT;
2968 break;
2969 case ISD::SETO:
2970 CondCode = AArch64CC::VC;
2971 break;
2972 case ISD::SETUO:
2973 CondCode = AArch64CC::VS;
2974 break;
2975 case ISD::SETUEQ:
2976 CondCode = AArch64CC::EQ;
2977 CondCode2 = AArch64CC::VS;
2978 break;
2979 case ISD::SETUGT:
2980 CondCode = AArch64CC::HI;
2981 break;
2982 case ISD::SETUGE:
2983 CondCode = AArch64CC::PL;
2984 break;
2985 case ISD::SETLT:
2986 case ISD::SETULT:
2987 CondCode = AArch64CC::LT;
2988 break;
2989 case ISD::SETLE:
2990 case ISD::SETULE:
2991 CondCode = AArch64CC::LE;
2992 break;
2993 case ISD::SETNE:
2994 case ISD::SETUNE:
2995 CondCode = AArch64CC::NE;
2996 break;
2997 }
2998}
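// Worked example (illustrative sketch only; register choices and the final
// sequence depend on later selection): SETONE maps to the pair (MI, GT), so a
// scalar "select if ordered-not-equal" needs the flags tested twice, e.g.
//   fcmp s0, s1
//   csel w0, w8, w9, mi
//   csel w0, w8, w0, gt
// which yields the true value when a < b or a > b (ordered), else the false
// value.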
2999
3000/// Convert a DAG fp condition code to an AArch64 CC.
3001/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3002/// should be AND'ed instead of OR'ed.
3003static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3004 AArch64CC::CondCode &CondCode,
3005 AArch64CC::CondCode &CondCode2) {
3006 CondCode2 = AArch64CC::AL;
3007 switch (CC) {
3008 default:
3009 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3010 assert(CondCode2 == AArch64CC::AL);
3011 break;
3012 case ISD::SETONE:
3013 // (a one b)
3014 // == ((a olt b) || (a ogt b))
3015 // == ((a ord b) && (a une b))
3016 CondCode = AArch64CC::VC;
3017 CondCode2 = AArch64CC::NE;
3018 break;
3019 case ISD::SETUEQ:
3020 // (a ueq b)
3021 // == ((a uno b) || (a oeq b))
3022 // == ((a ule b) && (a uge b))
3023 CondCode = AArch64CC::PL;
3024 CondCode2 = AArch64CC::LE;
3025 break;
3026 }
3027}
3028
3029/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3030/// CC usable with the vector instructions. Fewer operations are available
3031/// without a real NZCV register, so we have to use less efficient combinations
3032/// to get the same effect.
3033static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3034 AArch64CC::CondCode &CondCode,
3035 AArch64CC::CondCode &CondCode2,
3036 bool &Invert) {
3037 Invert = false;
3038 switch (CC) {
3039 default:
3040 // Mostly the scalar mappings work fine.
3041 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3042 break;
3043 case ISD::SETUO:
3044 Invert = true;
3045 [[fallthrough]];
3046 case ISD::SETO:
3047 CondCode = AArch64CC::MI;
3048 CondCode2 = AArch64CC::GE;
3049 break;
3050 case ISD::SETUEQ:
3051 case ISD::SETULT:
3052 case ISD::SETULE:
3053 case ISD::SETUGT:
3054 case ISD::SETUGE:
3055 // All of the compare-mask comparisons are ordered, but we can switch
3056 // between the ordered and unordered forms by a double inversion. E.g. ULE == !OGT.
3057 Invert = true;
3058 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3059 CondCode, CondCode2);
3060 break;
3061 }
3062}
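// Worked example (illustrative): for a vector SETULE this helper inverts the
// condition to OGT and sets Invert, so the compare can be emitted as an
// ordered greater-than compare (e.g. FCMGT) whose result mask is then
// bitwise inverted, exploiting ULE == !OGT.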
3063
3064static bool isLegalArithImmed(uint64_t C) {
3065 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3066 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3067 LLVM_DEBUG(dbgs() << "Is imm " << C
3068 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3069 return IsLegal;
3070}
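// Worked examples (illustrative): 4095 (0xFFF) satisfies the first check and
// 0xABC000 satisfies the second (low 12 bits clear, value < 2^24), so both are
// legal add/sub immediates; 4097 (0x1001) fails both checks and would have to
// be materialized in a register first.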
3071
3072// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3073// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3074// can be set differently by this operation. It comes down to whether
3075// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3076// everything is fine. If not then the optimization is wrong. Thus general
3077// comparisons are only valid if op2 != 0.
3078//
3079// So, finally, the only LLVM-native comparisons that don't mention C and V
3080// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3081// the absence of information about op2.
3082static bool isCMN(SDValue Op, ISD::CondCode CC) {
3083 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3084 (CC == ISD::SETEQ || CC == ISD::SETNE);
3085}
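// Worked example of why only EQ/NE are safe (illustrative): with op2 == 0,
// "cmp x0, #0" computes x0 + ~0 + 1 and always produces a carry-out (C = 1),
// while the would-be replacement "cmn x0, #0" computes x0 + 0 and leaves
// C = 0. N and Z agree between the two forms, so equality tests are fine,
// but an unsigned test such as HS/LO could read different flags.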
3086
3087static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3088 SelectionDAG &DAG, SDValue Chain,
3089 bool IsSignaling) {
3090 EVT VT = LHS.getValueType();
3091 assert(VT != MVT::f128);
3092
3093 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3094
3095 if (VT == MVT::f16 && !FullFP16) {
3096 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3097 {Chain, LHS});
3098 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3099 {LHS.getValue(1), RHS});
3100 Chain = RHS.getValue(1);
3101 VT = MVT::f32;
3102 }
3103 unsigned Opcode =
3104 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3105 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3106}
3107
3108static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3109 const SDLoc &dl, SelectionDAG &DAG) {
3110 EVT VT = LHS.getValueType();
3111 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3112
3113 if (VT.isFloatingPoint()) {
3114 assert(VT != MVT::f128);
3115 if (VT == MVT::f16 && !FullFP16) {
3116 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3117 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3118 VT = MVT::f32;
3119 }
3120 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3121 }
3122
3123 // The CMP instruction is just an alias for SUBS, and representing it as
3124 // SUBS means that it's possible to get CSE with subtract operations.
3125 // A later phase can perform the optimization of setting the destination
3126 // register to WZR/XZR if it ends up being unused.
3127 unsigned Opcode = AArch64ISD::SUBS;
3128
3129 if (isCMN(RHS, CC)) {
3130 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3131 Opcode = AArch64ISD::ADDS;
3132 RHS = RHS.getOperand(1);
3133 } else if (isCMN(LHS, CC)) {
3134 // As we are looking for EQ/NE compares, the operands can be commuted; can
3135 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3136 Opcode = AArch64ISD::ADDS;
3137 LHS = LHS.getOperand(1);
3138 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3139 if (LHS.getOpcode() == ISD::AND) {
3140 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3141 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3142 // of the signed comparisons.
3143 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3144 DAG.getVTList(VT, MVT_CC),
3145 LHS.getOperand(0),
3146 LHS.getOperand(1));
3147 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3148 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3149 return ANDSNode.getValue(1);
3150 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3151 // Use result of ANDS
3152 return LHS.getValue(1);
3153 }
3154 }
3155
3156 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3157 .getValue(1);
3158}
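// Illustrative example of the ANDS path above: for "(x & 0xff) == 0" the
// (CMP (and X, 0xff), 0) pattern is folded into a single flag-setting ANDS,
// which ends up as "tst x0, #0xff" once the otherwise-unused result is
// rewritten to the zero register; no separate compare against zero is needed.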
3159
3160/// \defgroup AArch64CCMP CMP;CCMP matching
3161///
3162/// These functions deal with the formation of CMP;CCMP;... sequences.
3163/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3164/// a comparison. They set the NZCV flags to a predefined value if their
3165/// predicate is false. This allows arbitrary conjunctions to be expressed; for
3166/// example, "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3167/// can be expressed as:
3168/// cmp A
3169/// ccmp B, inv(CB), CA
3170/// check for CB flags
3171///
3172/// This naturally lets us implement chains of AND operations with SETCC
3173/// operands. And we can even implement some other situations by transforming
3174/// them:
3175/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3176/// negating the flags used in the CCMP/FCCMP operation.
3177/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3178/// by negating the flags we test for afterwards. i.e.
3179/// NEG (CMP CCMP CCMP ...) can be implemented.
3180/// - Note that we can only ever negate all previously processed results.
3181/// What we can not implement by flipping the flags to test is a negation
3182/// of two sub-trees (because the negation affects all sub-trees emitted so
3183/// far, so the 2nd sub-tree we emit would also affect the first).
3184/// With those tools we can implement some OR operations:
3185/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3186/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3187/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3188/// elimination rules from earlier to implement the whole thing as a
3189/// CCMP/FCCMP chain.
3190///
3191/// As complete example:
3192/// or (or (setCA (cmp A)) (setCB (cmp B)))
3193/// (and (setCC (cmp C)) (setCD (cmp D)))
3194/// can be reassociated to:
3195/// or (and (setCC (cmp C)) (setCD (cmp D)))
3196/// (or (setCA (cmp A)) (setCB (cmp B)))
3197/// can be transformed to:
3198/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3199/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3200/// which can be implemented as:
3201/// cmp C
3202/// ccmp D, inv(CD), CC
3203/// ccmp A, CA, inv(CD)
3204/// ccmp B, CB, inv(CA)
3205/// check for CB flags
3206///
3207/// A counterexample is "or (and A B) (and C D)" which translates to
3208/// not (and (not (and (not A) (not B))) (not (and (not C) (not D))));
3209/// we can only implement one of the inner (not) operations, not both!
3210/// @{
3211
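// Illustrative sketch of the kind of sequence this machinery produces for
// "a == 0 && b > 5" (register numbers and the exact NZCV immediate are for
// exposition only):
//   cmp  w0, #0          // flags for a == 0
//   ccmp w1, #5, #4, eq  // if eq, compare b with 5; else force Z=1, failing GT
//   cset w2, gt          // 1 iff both conditions held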
3212/// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
3213static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3214 ISD::CondCode CC, SDValue CCOp,
3215 AArch64CC::CondCode Predicate,
3216 AArch64CC::CondCode OutCC,
3217 const SDLoc &DL, SelectionDAG &DAG) {
3218 unsigned Opcode = 0;
3219 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3220
3221 if (LHS.getValueType().isFloatingPoint()) {
3222 assert(LHS.getValueType() != MVT::f128);
3223 if (LHS.getValueType() == MVT::f16 && !FullFP16) {
3224 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3225 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3226 }
3227 Opcode = AArch64ISD::FCCMP;
3228 } else if (RHS.getOpcode() == ISD::SUB) {
3229 SDValue SubOp0 = RHS.getOperand(0);
3230 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3231 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3232 Opcode = AArch64ISD::CCMN;
3233 RHS = RHS.getOperand(1);
3234 }
3235 }
3236 if (Opcode == 0)
3237 Opcode = AArch64ISD::CCMP;
3238
3239 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3240 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3241 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3242 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3243 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3244}
3245
3246/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3247/// expressed as a conjunction. See \ref AArch64CCMP.
3248/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3249/// changing the conditions on the SETCC tests.
3250/// (this means we can call emitConjunctionRec() with
3251/// Negate==true on this sub-tree)
3252/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3253/// cannot do the negation naturally. We are required to
3254/// emit the subtree first in this case.
3255/// \param WillNegate Is true if we are called when the result of this
3256/// subexpression must be negated. This happens when the
3257/// outer expression is an OR. We can use this fact to know
3258/// that we have a double negation (or (or ...) ...) that
3259/// can be implemented for free.
3260static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3261 bool &MustBeFirst, bool WillNegate,
3262 unsigned Depth = 0) {
3263 if (!Val.hasOneUse())
3264 return false;
3265 unsigned Opcode = Val->getOpcode();
3266 if (Opcode == ISD::SETCC) {
3267 if (Val->getOperand(0).getValueType() == MVT::f128)
3268 return false;
3269 CanNegate = true;
3270 MustBeFirst = false;
3271 return true;
3272 }
3273 // Protect against exponential runtime and stack overflow.
3274 if (Depth > 6)
3275 return false;
3276 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3277 bool IsOR = Opcode == ISD::OR;
3278 SDValue O0 = Val->getOperand(0);
3279 SDValue O1 = Val->getOperand(1);
3280 bool CanNegateL;
3281 bool MustBeFirstL;
3282 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3283 return false;
3284 bool CanNegateR;
3285 bool MustBeFirstR;
3286 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3287 return false;
3288
3289 if (MustBeFirstL && MustBeFirstR)
3290 return false;
3291
3292 if (IsOR) {
3293 // For an OR expression we need to be able to naturally negate at least
3294 // one side or we cannot do the transformation at all.
3295 if (!CanNegateL && !CanNegateR)
3296 return false;
3297 // If the result of the OR will be negated and we can naturally negate
3298 // the leaves, then this sub-tree as a whole negates naturally.
3299 CanNegate = WillNegate && CanNegateL && CanNegateR;
3300 // If we cannot naturally negate the whole sub-tree, then this must be
3301 // emitted first.
3302 MustBeFirst = !CanNegate;
3303 } else {
3304 assert(Opcode == ISD::AND && "Must be OR or AND");
3305 // We cannot naturally negate an AND operation.
3306 CanNegate = false;
3307 MustBeFirst = MustBeFirstL || MustBeFirstR;
3308 }
3309 return true;
3310 }
3311 return false;
3312}
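// Illustrative example: for "or (and A B) (and C D)" (the counterexample from
// the \ref AArch64CCMP comment) neither AND subtree can be negated naturally,
// so the "!CanNegateL && !CanNegateR" check above rejects the whole tree.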
3313
3314/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3315/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3316/// Tries to transform the given i1 producing node @p Val to a series of compare
3317/// and conditional compare operations. @returns an NZCV flags producing node
3318/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3319/// the transformation was not possible.
3320/// \p Negate is true if we want this sub-tree being negated just by changing
3321/// SETCC conditions.
3322static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3323 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3324 AArch64CC::CondCode Predicate) {
3325 // We're at a tree leaf, produce a conditional comparison operation.
3326 unsigned Opcode = Val->getOpcode();
3327 if (Opcode == ISD::SETCC) {
3328 SDValue LHS = Val->getOperand(0);
3329 SDValue RHS = Val->getOperand(1);
3330 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();